From 59d66bc2bb8e00f38f255ec54d283e69b04981bc Mon Sep 17 00:00:00 2001 From: Jake Massimo Date: Mon, 15 Jun 2026 20:27:46 +0000 Subject: [PATCH] x86_64: Add AVX2 assembly for polyz_unpack with HOL-Light proofs Replace the AVX2 C intrinsics for polyz_unpack with fully-unrolled hand-written assembly, mirroring the existing AArch64 conversion, and add HOL-Light functional-correctness and memory-safety proofs together with CBMC contracts. Both variants are covered: - polyz_unpack_17 (GAMMA1 = 2^17, 18-bit packed, ML-DSA-44) - polyz_unpack_19 (GAMMA1 = 2^19, 20-bit packed, ML-DSA-65/87) Each routine unpacks 8 coefficients per block (VPSHUFB/VPSRLVD/VPAND/ VPSUBD) and builds the shuffle, shift, mask and gamma1 constants inline, so it takes no table argument. Resolves #925. Resolves #915. Signed-off-by: Jake Massimo --- .github/workflows/hol_light.yml | 4 + BIBLIOGRAPHY.md | 10 +- dev/x86_64/meta.h | 4 +- dev/x86_64/src/arith_native_x86_64.h | 26 +- dev/x86_64/src/polyz_unpack_17_avx2.c | 94 -- dev/x86_64/src/polyz_unpack_17_avx2_asm.S | 135 +++ dev/x86_64/src/polyz_unpack_19_avx2.c | 96 -- dev/x86_64/src/polyz_unpack_19_avx2_asm.S | 135 +++ mldsa/mldsa_native.c | 6 +- mldsa/mldsa_native_asm.S | 6 +- mldsa/src/native/x86_64/meta.h | 4 +- .../native/x86_64/src/arith_native_x86_64.h | 26 +- .../native/x86_64/src/polyz_unpack_17_avx2.c | 94 -- .../x86_64/src/polyz_unpack_17_avx2_asm.S | 342 +++++++ .../native/x86_64/src/polyz_unpack_19_avx2.c | 96 -- .../x86_64/src/polyz_unpack_19_avx2_asm.S | 342 +++++++ .../cbmc/polyz_unpack_native_x86_64/Makefile | 49 + .../polyz_unpack_native_x86_64_harness.c | 25 + proofs/hol_light/README.md | 2 + proofs/hol_light/x86_64/Makefile | 2 + .../x86_64/mldsa/polyz_unpack_17_avx2_asm.S | 333 +++++++ .../x86_64/mldsa/polyz_unpack_19_avx2_asm.S | 333 +++++++ .../hol_light/x86_64/proofs/dump_bytecode.ml | 8 + proofs/hol_light/x86_64/proofs/mldsa_utils.ml | 88 ++ .../x86_64/proofs/polyz_unpack_17_avx2_asm.ml | 909 +++++++++++++++++ 
.../x86_64/proofs/polyz_unpack_19_avx2_asm.ml | 911 ++++++++++++++++++ .../x86_64/proofs/subroutine_signatures.ml | 30 + scripts/autogen | 12 + 28 files changed, 3720 insertions(+), 402 deletions(-) delete mode 100644 dev/x86_64/src/polyz_unpack_17_avx2.c create mode 100644 dev/x86_64/src/polyz_unpack_17_avx2_asm.S delete mode 100644 dev/x86_64/src/polyz_unpack_19_avx2.c create mode 100644 dev/x86_64/src/polyz_unpack_19_avx2_asm.S delete mode 100644 mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c create mode 100644 mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S delete mode 100644 mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c create mode 100644 mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S create mode 100644 proofs/cbmc/polyz_unpack_native_x86_64/Makefile create mode 100644 proofs/cbmc/polyz_unpack_native_x86_64/polyz_unpack_native_x86_64_harness.c create mode 100644 proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S create mode 100644 proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S create mode 100644 proofs/hol_light/x86_64/proofs/polyz_unpack_17_avx2_asm.ml create mode 100644 proofs/hol_light/x86_64/proofs/polyz_unpack_19_avx2_asm.ml diff --git a/.github/workflows/hol_light.yml b/.github/workflows/hol_light.yml index 5458ed57b..c27941f20 100644 --- a/.github/workflows/hol_light.yml +++ b/.github/workflows/hol_light.yml @@ -212,6 +212,10 @@ jobs: needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"] - name: poly_chknorm_avx2_asm needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"] + - name: polyz_unpack_17_avx2_asm + needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"] + - name: polyz_unpack_19_avx2_asm + needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"] - name: ntt_avx2_asm needs: ["mldsa_specs.ml", "mldsa_utils.ml", "mldsa_zetas.ml", "subroutine_signatures.ml"] - name: intt_avx2_asm diff --git a/BIBLIOGRAPHY.md b/BIBLIOGRAPHY.md index 
c8ba6b827..7ac72576f 100644 --- a/BIBLIOGRAPHY.md +++ b/BIBLIOGRAPHY.md @@ -270,8 +270,8 @@ source code and documentation. - [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c) - [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c) - [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c) - - [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c) - - [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c) + - [dev/x86_64/src/polyz_unpack_17_avx2_asm.S](dev/x86_64/src/polyz_unpack_17_avx2_asm.S) + - [dev/x86_64/src/polyz_unpack_19_avx2_asm.S](dev/x86_64/src/polyz_unpack_19_avx2_asm.S) - [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c) - [dev/x86_64/src/rej_uniform_eta2_avx2.c](dev/x86_64/src/rej_uniform_eta2_avx2.c) - [dev/x86_64/src/rej_uniform_eta4_avx2.c](dev/x86_64/src/rej_uniform_eta4_avx2.c) @@ -288,8 +288,8 @@ source code and documentation. 
- [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c) - [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c) - [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c) - - [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c) - - [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c) + - [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S) + - [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S) - [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c) - [mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c) - [mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c) @@ -302,6 +302,8 @@ source code and documentation. 
- [proofs/hol_light/x86_64/mldsa/pointwise_avx2_asm.S](proofs/hol_light/x86_64/mldsa/pointwise_avx2_asm.S) - [proofs/hol_light/x86_64/mldsa/poly_caddq_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_caddq_avx2_asm.S) - [proofs/hol_light/x86_64/mldsa/poly_chknorm_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_chknorm_avx2_asm.S) + - [proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S](proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S) + - [proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S](proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S) ### `Round3_Spec` diff --git a/dev/x86_64/meta.h b/dev/x86_64/meta.h index 2d3df8400..55924ffec 100644 --- a/dev/x86_64/meta.h +++ b/dev/x86_64/meta.h @@ -235,7 +235,7 @@ static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_polyz_unpack_17_avx2(r, a); + mld_polyz_unpack_17_avx2_asm(r, a); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ @@ -250,7 +250,7 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_polyz_unpack_19_avx2(r, a); + mld_polyz_unpack_19_avx2_asm(r, a); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ diff --git a/dev/x86_64/src/arith_native_x86_64.h b/dev/x86_64/src/arith_native_x86_64.h index 592ca6999..6ec3c1434 100644 --- a/dev/x86_64/src/arith_native_x86_64.h +++ b/dev/x86_64/src/arith_native_x86_64.h @@ -138,11 +138,29 @@ __contract__( ); #if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) -#define mld_polyz_unpack_17_avx2 MLD_NAMESPACE(mld_polyz_unpack_17_avx2) -void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a); +#define mld_polyz_unpack_17_avx2_asm MLD_NAMESPACE(polyz_unpack_17_avx2_asm) +MLD_SYSV_ABI +void mld_polyz_unpack_17_avx2_asm(int32_t *r, const uint8_t *a) +/* This must be 
kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/polyz_unpack_17_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a, 576)) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_bound(r, 0, MLDSA_N, -((1 << 17) - 1), (1 << 17) + 1)) +); -#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2) -void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a); +#define mld_polyz_unpack_19_avx2_asm MLD_NAMESPACE(polyz_unpack_19_avx2_asm) +MLD_SYSV_ABI +void mld_polyz_unpack_19_avx2_asm(int32_t *r, const uint8_t *a) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/polyz_unpack_19_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a, 640)) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_bound(r, 0, MLDSA_N, -((1 << 19) - 1), (1 << 19) + 1)) +); #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ #define mld_pointwise_avx2_asm MLD_NAMESPACE(pointwise_avx2_asm) diff --git a/dev/x86_64/src/polyz_unpack_17_avx2.c b/dev/x86_64/src/polyz_unpack_17_avx2.c deleted file mode 100644 index ac98bf64f..000000000 --- a/dev/x86_64/src/polyz_unpack_17_avx2.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. 
- */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - (!defined(MLD_CONFIG_NO_SIGN_API) || \ - !defined(MLD_CONFIG_NO_VERIFY_API)) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - MLD_CONFIG_PARAMETER_SET == 44) - -#include <immintrin.h> -#include "arith_native_x86_64.h" - -void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a) -{ - unsigned int i; - __m256i f; - __m128i low, high; - - const __m256i shufbidx = _mm256_set_epi8( - -1, 31, 30, 29, -1, 29, 28, 27, -1, 27, 26, 25, -1, 25, 24, 23, -1, 8, 7, - 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0); - const __m256i srlvdidx = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0); - const __m256i mask = _mm256_set1_epi32(0x3FFFF); - const __m256i gamma1 = _mm256_set1_epi32((1 << 17)); - - for (i = 0; i < MLDSA_N / 8; i++) - { - /* Load bytes 0..15 into low 128-bit vector */ - low = _mm_loadu_si128((__m128i *)&a[18 * i]); - /* Load bytes 2..17 into high 128-bit vector */ - high = _mm_loadu_si128((__m128i *)&a[18 * i + 2]); - /* Combine into 256-bit vector */ - f = _mm256_inserti128_si256(_mm256_castsi128_si256(low), high, 1); - - /* Shuffling 8-bit lanes - * - * ┌─ Indices 0-8 into low 128-bit half ───────────────────────────────────┐ - * │ Shuffle: [-1, 8, 7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0] │ - * │ Result: [0, byte8, byte7, byte6, ..., 0, byte2, byte1, byte0] │ - * └───────────────────────────────────────────────────────────────────────┘ - * - * ┌─ Indices 16-31 into high 128-bit half ────────────────────────────────┐ - * │ Shuffle: [-1,31, 30, 29, -1,29, 28, 27, -1,27, 26, 25, -1,25, 24, 23] │ - * │ Result: [0, byte17, byte16, byte15, ..., 0, byte11, byte10, byte9] │ - * └───────────────────────────────────────────────────────────────────────┘ - */ - f = _mm256_shuffle_epi8(f, shufbidx); - - /* Keep only 18 out of 24 bits in each 32-bit lane */ - /* Bits 0..23 16..39 32..55 48..71 - * 72..95 88..111 104..127 120..143 */ - f =
_mm256_srlv_epi32(f, srlvdidx); - /* Bits 0..23 18..39 36..55 54..71 - * 72..95 90..111 108..127 126..143 */ - f = _mm256_and_si256(f, mask); - /* Bits 0..17 18..35 36..53 54..71 - * 72..89 90..107 108..125 126..143 */ - - /* Map [0, 1, ..., 2^18-1] to [2^17, 2^17-1, ..., -2^17+1] */ - f = _mm256_sub_epi32(gamma1, f); - - _mm256_store_si256((__m256i *)&r[8 * i], f); - } -} -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ - !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ - */ - -MLD_EMPTY_CU(avx2_polyz_unpack_17) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ - !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 44)) */ diff --git a/dev/x86_64/src/polyz_unpack_17_avx2_asm.S b/dev/x86_64/src/polyz_unpack_17_avx2_asm.S new file mode 100644 index 000000000..dd32c1bac --- /dev/null +++ b/dev/x86_64/src/polyz_unpack_17_avx2_asm.S @@ -0,0 +1,135 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_polyz_unpack_17_avx2_asm + * + * Description: Unpack polynomial z with 18-bit packed coefficients + * (GAMMA1 = 2^17). Maps packed [0, 2^18-1] to signed + * [-(2^17-1), 2^17] via GAMMA1 - x. 
+ * + * Arguments: - int32_t *r: pointer to output polynomial (1024 bytes) + * - const uint8_t *a: pointer to packed input (576 bytes) + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + (!defined(MLD_CONFIG_NO_SIGN_API) || \ + !defined(MLD_CONFIG_NO_VERIFY_API)) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 44) + +/* simpasm: header-end */ + +/* Unpack one block of 8 coefficients (18 packed input bytes -> 8 x int32). + * + * Load bytes [18i .. 18i+15] into the low 128-bit half and [18i+2 .. 18i+17] + * into the high 128-bit half, then replicate the AVX2 intrinsic pipeline: + * byte shuffle, per-lane variable right shift, mask to 18 bits, and + * gamma1 - x. */ +.macro unpack17_block i +vmovdqu (18*\i)(%rsi), %xmm0 +vmovdqu (18*\i + 2)(%rsi), %xmm5 +vinserti128 $1, %xmm5, %ymm0, %ymm0 +vpshufb %ymm1, %ymm0, %ymm0 +vpsrlvd %ymm2, %ymm0, %ymm0 +vpand %ymm3, %ymm0, %ymm0 +vpsubd %ymm0, %ymm4, %ymm0 +vmovdqa %ymm0, (32*\i)(%rdi) +.endm + +.text +.global MLD_ASM_NAMESPACE(polyz_unpack_17_avx2_asm) +.balign 16 +MLD_ASM_FN_SYMBOL(polyz_unpack_17_avx2_asm) + +/* Build the 256-bit byte-shuffle constant (shufbidx) in %ymm1. + * Low 128: {00,01,02,FF, 02,03,04,FF, 04,05,06,FF, 06,07,08,FF} + * High 128: {17,18,19,FF, 19,1A,1B,FF, 1B,1C,1D,FF, 1D,1E,1F,FF} */ +movabs $0xFF040302FF020100, %rax +vmovq %rax, %xmm1 +movabs $0xFF080706FF060504, %rax +vpinsrq $1, %rax, %xmm1, %xmm1 +movabs $0xFF1B1A19FF191817, %rax +vmovq %rax, %xmm5 +movabs $0xFF1F1E1DFF1D1C1B, %rax +vpinsrq $1, %rax, %xmm5, %xmm5 +vinserti128 $1, %xmm5, %ymm1, %ymm1 + +/* Build the per-lane right-shift constant (srlvdidx) = (6,4,2,0,6,4,2,0) + * in %ymm2; the two 128-bit halves are identical. 
*/ +movabs $0x0000000200000000, %rax +vmovq %rax, %xmm2 +movabs $0x0000000600000004, %rax +vpinsrq $1, %rax, %xmm2, %xmm2 +vinserti128 $1, %xmm2, %ymm2, %ymm2 + +/* Build the 18-bit mask (0x3FFFF) broadcast into %ymm3. */ +mov $0x3FFFF, %eax +vmovd %eax, %xmm3 +vpbroadcastd %xmm3, %ymm3 + +/* Build gamma1 = 2^17 (0x20000) broadcast into %ymm4. */ +mov $0x20000, %eax +vmovd %eax, %xmm4 +vpbroadcastd %xmm4, %ymm4 + +unpack17_block 0 +unpack17_block 1 +unpack17_block 2 +unpack17_block 3 +unpack17_block 4 +unpack17_block 5 +unpack17_block 6 +unpack17_block 7 +unpack17_block 8 +unpack17_block 9 +unpack17_block 10 +unpack17_block 11 +unpack17_block 12 +unpack17_block 13 +unpack17_block 14 +unpack17_block 15 +unpack17_block 16 +unpack17_block 17 +unpack17_block 18 +unpack17_block 19 +unpack17_block 20 +unpack17_block 21 +unpack17_block 22 +unpack17_block 23 +unpack17_block 24 +unpack17_block 25 +unpack17_block 26 +unpack17_block 27 +unpack17_block 28 +unpack17_block 29 +unpack17_block 30 +unpack17_block 31 +ret + +/* simpasm: footer-start */ + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ + !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ + */ diff --git a/dev/x86_64/src/polyz_unpack_19_avx2.c b/dev/x86_64/src/polyz_unpack_19_avx2.c deleted file mode 100644 index fe5c3895b..000000000 --- a/dev/x86_64/src/polyz_unpack_19_avx2.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. 
- */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - (!defined(MLD_CONFIG_NO_SIGN_API) || \ - !defined(MLD_CONFIG_NO_VERIFY_API)) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) - -#include <immintrin.h> -#include "arith_native_x86_64.h" - -void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a) -{ - unsigned int i; - __m256i f; - __m128i low, high; - - const __m256i shufbidx = _mm256_set_epi8( - -1, 31, 30, 29, -1, 29, 28, 27, -1, 26, 25, 24, -1, 24, 23, 22, -1, 9, 8, - 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0); - /* Equivalent to _mm256_set_epi32(4, 0, 4, 0, 4, 0, 4, 0) */ - const __m256i srlvdidx = _mm256_set1_epi64x((uint64_t)4 << 32); - const __m256i mask = _mm256_set1_epi32(0xFFFFF); - const __m256i gamma1 = _mm256_set1_epi32((1 << 19)); - - for (i = 0; i < MLDSA_N / 8; i++) - { - /* Load bytes 0..15 into low 128-bit vector */ - low = _mm_loadu_si128((__m128i *)&a[20 * i]); - /* Load bytes 4..19 into high 128-bit vector */ - high = _mm_loadu_si128((__m128i *)&a[20 * i + 4]); - /* Combine into 256-bit vector */ - f = _mm256_inserti128_si256(_mm256_castsi128_si256(low), high, 1); - - /* Shuffling 8-bit lanes - * - * ┌─ Indices 0-9 into low 128-bit half ───────────────────────────────────┐ - * │ Shuffle: [-1, 9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0] │ - * │ Result: [0, byte9, byte8, byte7, ..., 0, byte2, byte1, byte0] │ - * └───────────────────────────────────────────────────────────────────────┘ - * - * ┌─ Indices 16-31 into high 128-bit half ────────────────────────────────┐ - * │ Shuffle: [-1,31, 30, 29, -1,29, 28, 27, -1,26, 25, 24, -1,24, 23, 22] │ - * │ Result: [0, byte19, byte18, byte17, ..., 0, byte12, byte11, byte10] │ - * └───────────────────────────────────────────────────────────────────────┘ - */ - f = _mm256_shuffle_epi8(f, shufbidx); - - /* Keep only 20 out of 24 bits in each 32-bit
lane */ - /* Bits 0..23 16..39 40..63 56..79 - * 80..103 96..119 120..143 136..159 */ - f = _mm256_srlv_epi32(f, srlvdidx); - /* Bits 0..23 20..39 40..63 60..79 - * 80..103 100..119 120..143 140..159 */ - f = _mm256_and_si256(f, mask); - /* Bits 0..19 20..39 40..59 60..79 - * 80..99 100..119 120..139 140..159 */ - - /* Map [0, 1, ..., 2^20-1] to [2^19, 2^19-1, ..., -2^19+1] */ - f = _mm256_sub_epi32(gamma1, f); - - _mm256_store_si256((__m256i *)&r[8 * i], f); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ - !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87) */ - -MLD_EMPTY_CU(avx2_polyz_unpack_19) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ - !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87)) */ diff --git a/dev/x86_64/src/polyz_unpack_19_avx2_asm.S b/dev/x86_64/src/polyz_unpack_19_avx2_asm.S new file mode 100644 index 000000000..2d8d8f529 --- /dev/null +++ b/dev/x86_64/src/polyz_unpack_19_avx2_asm.S @@ -0,0 +1,135 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_polyz_unpack_19_avx2_asm + * + * Description: Unpack polynomial z with 20-bit packed coefficients + * (GAMMA1 = 2^19). Maps packed [0, 2^20-1] to signed + * [-(2^19-1), 2^19] via GAMMA1 - x. 
+ * + * Arguments: - int32_t *r: pointer to output polynomial (1024 bytes) + * - const uint8_t *a: pointer to packed input (640 bytes) + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + (!defined(MLD_CONFIG_NO_SIGN_API) || \ + !defined(MLD_CONFIG_NO_VERIFY_API)) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) + +/* simpasm: header-end */ + +/* Unpack one block of 8 coefficients (20 packed input bytes -> 8 x int32). + * + * Load bytes [20i .. 20i+15] into the low 128-bit half and [20i+4 .. 20i+19] + * into the high 128-bit half, then replicate the AVX2 intrinsic pipeline: + * byte shuffle, per-lane variable right shift, mask to 20 bits, and + * gamma1 - x. */ +.macro unpack19_block i +vmovdqu (20*\i)(%rsi), %xmm0 +vmovdqu (20*\i + 4)(%rsi), %xmm5 +vinserti128 $1, %xmm5, %ymm0, %ymm0 +vpshufb %ymm1, %ymm0, %ymm0 +vpsrlvd %ymm2, %ymm0, %ymm0 +vpand %ymm3, %ymm0, %ymm0 +vpsubd %ymm0, %ymm4, %ymm0 +vmovdqa %ymm0, (32*\i)(%rdi) +.endm + +.text +.global MLD_ASM_NAMESPACE(polyz_unpack_19_avx2_asm) +.balign 16 +MLD_ASM_FN_SYMBOL(polyz_unpack_19_avx2_asm) + +/* Build the 256-bit byte-shuffle constant (shufbidx) in %ymm1. + * Low 128: {00,01,02,FF, 02,03,04,FF, 05,06,07,FF, 07,08,09,FF} + * High 128: {16,17,18,FF, 18,19,1A,FF, 1B,1C,1D,FF, 1D,1E,1F,FF} */ +movabs $0xFF040302FF020100, %rax +vmovq %rax, %xmm1 +movabs $0xFF090807FF070605, %rax +vpinsrq $1, %rax, %xmm1, %xmm1 +movabs $0xFF1A1918FF181716, %rax +vmovq %rax, %xmm5 +movabs $0xFF1F1E1DFF1D1C1B, %rax +vpinsrq $1, %rax, %xmm5, %xmm5 +vinserti128 $1, %xmm5, %ymm1, %ymm1 + +/* Build the per-lane right-shift constant (srlvdidx) = (4,0,4,0,4,0,4,0) + * in %ymm2; the two 128-bit halves are identical. 
*/ +movabs $0x0000000400000000, %rax +vmovq %rax, %xmm2 +movabs $0x0000000400000000, %rax +vpinsrq $1, %rax, %xmm2, %xmm2 +vinserti128 $1, %xmm2, %ymm2, %ymm2 + +/* Build the 20-bit mask (0xFFFFF) broadcast into %ymm3. */ +mov $0xFFFFF, %eax +vmovd %eax, %xmm3 +vpbroadcastd %xmm3, %ymm3 + +/* Build gamma1 = 2^19 (0x80000) broadcast into %ymm4. */ +mov $0x80000, %eax +vmovd %eax, %xmm4 +vpbroadcastd %xmm4, %ymm4 + +unpack19_block 0 +unpack19_block 1 +unpack19_block 2 +unpack19_block 3 +unpack19_block 4 +unpack19_block 5 +unpack19_block 6 +unpack19_block 7 +unpack19_block 8 +unpack19_block 9 +unpack19_block 10 +unpack19_block 11 +unpack19_block 12 +unpack19_block 13 +unpack19_block 14 +unpack19_block 15 +unpack19_block 16 +unpack19_block 17 +unpack19_block 18 +unpack19_block 19 +unpack19_block 20 +unpack19_block 21 +unpack19_block 22 +unpack19_block 23 +unpack19_block 24 +unpack19_block 25 +unpack19_block 26 +unpack19_block 27 +unpack19_block 28 +unpack19_block 29 +unpack19_block 30 +unpack19_block 31 +ret + +/* simpasm: footer-start */ + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ + !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87) */ diff --git a/mldsa/mldsa_native.c b/mldsa/mldsa_native.c index 4e296faae..9365ed369 100644 --- a/mldsa/mldsa_native.c +++ b/mldsa/mldsa_native.c @@ -87,8 +87,6 @@ #include "src/native/x86_64/src/poly_decompose_88_avx2.c" #include "src/native/x86_64/src/poly_use_hint_32_avx2.c" #include "src/native/x86_64/src/poly_use_hint_88_avx2.c" -#include "src/native/x86_64/src/polyz_unpack_17_avx2.c" -#include "src/native/x86_64/src/polyz_unpack_19_avx2.c" #include "src/native/x86_64/src/rej_uniform_avx2.c" #include "src/native/x86_64/src/rej_uniform_eta2_avx2.c" #include "src/native/x86_64/src/rej_uniform_eta4_avx2.c" @@ -791,8 +789,8 @@ #undef mld_poly_decompose_88_avx2 #undef 
mld_poly_use_hint_32_avx2 #undef mld_poly_use_hint_88_avx2 -#undef mld_polyz_unpack_17_avx2 -#undef mld_polyz_unpack_19_avx2 +#undef mld_polyz_unpack_17_avx2_asm +#undef mld_polyz_unpack_19_avx2_asm #undef mld_rej_uniform_avx2 #undef mld_rej_uniform_eta2_avx2 #undef mld_rej_uniform_eta4_avx2 diff --git a/mldsa/mldsa_native_asm.S b/mldsa/mldsa_native_asm.S index 999871705..4877d5156 100644 --- a/mldsa/mldsa_native_asm.S +++ b/mldsa/mldsa_native_asm.S @@ -88,6 +88,8 @@ #include "src/native/x86_64/src/pointwise_avx2_asm.S" #include "src/native/x86_64/src/poly_caddq_avx2_asm.S" #include "src/native/x86_64/src/poly_chknorm_avx2_asm.S" +#include "src/native/x86_64/src/polyz_unpack_17_avx2_asm.S" +#include "src/native/x86_64/src/polyz_unpack_19_avx2_asm.S" #endif /* MLD_SYS_X86_64 */ #endif /* MLD_CONFIG_USE_NATIVE_BACKEND_ARITH */ @@ -800,8 +802,8 @@ #undef mld_poly_decompose_88_avx2 #undef mld_poly_use_hint_32_avx2 #undef mld_poly_use_hint_88_avx2 -#undef mld_polyz_unpack_17_avx2 -#undef mld_polyz_unpack_19_avx2 +#undef mld_polyz_unpack_17_avx2_asm +#undef mld_polyz_unpack_19_avx2_asm #undef mld_rej_uniform_avx2 #undef mld_rej_uniform_eta2_avx2 #undef mld_rej_uniform_eta4_avx2 diff --git a/mldsa/src/native/x86_64/meta.h b/mldsa/src/native/x86_64/meta.h index 2d3df8400..55924ffec 100644 --- a/mldsa/src/native/x86_64/meta.h +++ b/mldsa/src/native/x86_64/meta.h @@ -235,7 +235,7 @@ static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_polyz_unpack_17_avx2(r, a); + mld_polyz_unpack_17_avx2_asm(r, a); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \ @@ -250,7 +250,7 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a) { return MLD_NATIVE_FUNC_FALLBACK; } - mld_polyz_unpack_19_avx2(r, a); + mld_polyz_unpack_19_avx2_asm(r, a); return MLD_NATIVE_FUNC_SUCCESS; } #endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || 
MLD_CONFIG_PARAMETER_SET == 65 \ diff --git a/mldsa/src/native/x86_64/src/arith_native_x86_64.h b/mldsa/src/native/x86_64/src/arith_native_x86_64.h index 592ca6999..6ec3c1434 100644 --- a/mldsa/src/native/x86_64/src/arith_native_x86_64.h +++ b/mldsa/src/native/x86_64/src/arith_native_x86_64.h @@ -138,11 +138,29 @@ __contract__( ); #if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API) -#define mld_polyz_unpack_17_avx2 MLD_NAMESPACE(mld_polyz_unpack_17_avx2) -void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a); +#define mld_polyz_unpack_17_avx2_asm MLD_NAMESPACE(polyz_unpack_17_avx2_asm) +MLD_SYSV_ABI +void mld_polyz_unpack_17_avx2_asm(int32_t *r, const uint8_t *a) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/polyz_unpack_17_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a, 576)) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_bound(r, 0, MLDSA_N, -((1 << 17) - 1), (1 << 17) + 1)) +); -#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2) -void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a); +#define mld_polyz_unpack_19_avx2_asm MLD_NAMESPACE(polyz_unpack_19_avx2_asm) +MLD_SYSV_ABI +void mld_polyz_unpack_19_avx2_asm(int32_t *r, const uint8_t *a) +/* This must be kept in sync with the HOL-Light specification + * in proofs/hol_light/x86_64/proofs/polyz_unpack_19_avx2_asm.ml */ +__contract__( + requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N)) + requires(memory_no_alias(a, 640)) + assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N)) + ensures(array_bound(r, 0, MLDSA_N, -((1 << 19) - 1), (1 << 19) + 1)) +); #endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */ #define mld_pointwise_avx2_asm MLD_NAMESPACE(pointwise_avx2_asm) diff --git a/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c b/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c deleted file mode 
100644 index ac98bf64f..000000000 --- a/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. - */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - (!defined(MLD_CONFIG_NO_SIGN_API) || \ - !defined(MLD_CONFIG_NO_VERIFY_API)) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - MLD_CONFIG_PARAMETER_SET == 44) - -#include <immintrin.h> -#include "arith_native_x86_64.h" - -void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a) -{ - unsigned int i; - __m256i f; - __m128i low, high; - - const __m256i shufbidx = _mm256_set_epi8( - -1, 31, 30, 29, -1, 29, 28, 27, -1, 27, 26, 25, -1, 25, 24, 23, -1, 8, 7, - 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0); - const __m256i srlvdidx = _mm256_set_epi32(6, 4, 2, 0, 6, 4, 2, 0); - const __m256i mask = _mm256_set1_epi32(0x3FFFF); - const __m256i gamma1 = _mm256_set1_epi32((1 << 17)); - - for (i = 0; i < MLDSA_N / 8; i++) - { - /* Load bytes 0..15 into low 128-bit vector */ - low = _mm_loadu_si128((__m128i *)&a[18 * i]); - /* Load bytes 2..17 into high 128-bit vector */ - high = _mm_loadu_si128((__m128i *)&a[18 * i + 2]); - /* Combine into 256-bit vector */ - f = _mm256_inserti128_si256(_mm256_castsi128_si256(low), high, 1); - - /* Shuffling 8-bit lanes - * - * ┌─ Indices 0-8 into low 128-bit half ───────────────────────────────────┐ - * │ Shuffle: [-1, 8, 7, 6, -1, 6, 5, 4, -1, 4, 3, 2, -1, 2, 1, 0] │ - * │ Result: [0, byte8, byte7, byte6, ..., 0, byte2, byte1, byte0] │ - *
└───────────────────────────────────────────────────────────────────────┘ - * - * ┌─ Indices 16-31 into high 128-bit half ────────────────────────────────┐ - * │ Shuffle: [-1,31, 30, 29, -1,29, 28, 27, -1,27, 26, 25, -1,25, 24, 23] │ - * │ Result: [0, byte17, byte16, byte15, ..., 0, byte11, byte10, byte9] │ - * └───────────────────────────────────────────────────────────────────────┘ - */ - f = _mm256_shuffle_epi8(f, shufbidx); - - /* Keep only 18 out of 24 bits in each 32-bit lane */ - /* Bits 0..23 16..39 32..55 48..71 - * 72..95 88..111 104..127 120..143 */ - f = _mm256_srlv_epi32(f, srlvdidx); - /* Bits 0..23 18..39 36..55 54..71 - * 72..95 90..111 108..127 126..143 */ - f = _mm256_and_si256(f, mask); - /* Bits 0..17 18..35 36..53 54..71 - * 72..89 90..107 108..125 126..143 */ - - /* Map [0, 1, ..., 2^18-1] to [2^17, 2^17-1, ..., -2^17+1] */ - f = _mm256_sub_epi32(gamma1, f); - - _mm256_store_si256((__m256i *)&r[8 * i], f); - } -} -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ - !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ - */ - -MLD_EMPTY_CU(avx2_polyz_unpack_17) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ - !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \ - 44)) */ diff --git a/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S b/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S new file mode 100644 index 000000000..ceb763ab3 --- /dev/null +++ b/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S @@ -0,0 +1,342 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé 
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_polyz_unpack_17_avx2_asm + * + * Description: Unpack polynomial z with 18-bit packed coefficients + * (GAMMA1 = 2^17). Maps packed [0, 2^18-1] to signed + * [-(2^17-1), 2^17] via GAMMA1 - x. + * + * Arguments: - int32_t *r: pointer to output polynomial (1024 bytes) + * - const uint8_t *a: pointer to packed input (576 bytes) + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + (!defined(MLD_CONFIG_NO_SIGN_API) || \ + !defined(MLD_CONFIG_NO_VERIFY_API)) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + MLD_CONFIG_PARAMETER_SET == 44) + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/polyz_unpack_17_avx2_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(polyz_unpack_17_avx2_asm) +MLD_ASM_FN_SYMBOL(polyz_unpack_17_avx2_asm) + + .cfi_startproc + movabsq $-0xfbfcfd00fdff00, %rax # imm = 0xFF040302FF020100 + vmovq %rax, %xmm1 + movabsq $-0xf7f8f900f9fafc, %rax # imm = 0xFF080706FF060504 + vpinsrq $0x1, %rax, %xmm1, %xmm1 + movabsq $-0xe4e5e600e6e7e9, %rax # imm = 0xFF1B1A19FF191817 + vmovq %rax, %xmm5 + movabsq $-0xe0e1e200e2e3e5, %rax # imm = 0xFF1F1E1DFF1D1C1B + vpinsrq $0x1, %rax, %xmm5, %xmm5 + vinserti128 $0x1, %xmm5, %ymm1, %ymm1 + movabsq $0x200000000, %rax # imm = 0x200000000 + vmovq %rax, %xmm2 + movabsq $0x600000004, %rax # imm = 0x600000004 + vpinsrq $0x1, %rax, %xmm2, %xmm2 + vinserti128 $0x1, %xmm2, %ymm2, %ymm2 + movl $0x3ffff, %eax # imm = 0x3FFFF + vmovd %eax, %xmm3 + vpbroadcastd %xmm3, %ymm3 + movl $0x20000, %eax # imm = 0x20000 + vmovd %eax, %xmm4 + vpbroadcastd %xmm4, %ymm4 + vmovdqu (%rsi), %xmm0 + vmovdqu 0x2(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, (%rdi) + vmovdqu 0x12(%rsi), %xmm0 + vmovdqu 0x14(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x20(%rdi) + vmovdqu 0x24(%rsi), %xmm0 + vmovdqu 0x26(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x40(%rdi) + vmovdqu 0x36(%rsi), %xmm0 + vmovdqu 0x38(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x60(%rdi) + vmovdqu 0x48(%rsi), %xmm0 + vmovdqu 0x4a(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd 
%ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x80(%rdi) + vmovdqu 0x5a(%rsi), %xmm0 + vmovdqu 0x5c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xa0(%rdi) + vmovdqu 0x6c(%rsi), %xmm0 + vmovdqu 0x6e(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xc0(%rdi) + vmovdqu 0x7e(%rsi), %xmm0 + vmovdqu 0x80(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xe0(%rdi) + vmovdqu 0x90(%rsi), %xmm0 + vmovdqu 0x92(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x100(%rdi) + vmovdqu 0xa2(%rsi), %xmm0 + vmovdqu 0xa4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x120(%rdi) + vmovdqu 0xb4(%rsi), %xmm0 + vmovdqu 0xb6(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x140(%rdi) + vmovdqu 0xc6(%rsi), %xmm0 + vmovdqu 0xc8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x160(%rdi) + vmovdqu 0xd8(%rsi), %xmm0 + vmovdqu 0xda(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, 
%ymm4, %ymm0 + vmovdqa %ymm0, 0x180(%rdi) + vmovdqu 0xea(%rsi), %xmm0 + vmovdqu 0xec(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1a0(%rdi) + vmovdqu 0xfc(%rsi), %xmm0 + vmovdqu 0xfe(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1c0(%rdi) + vmovdqu 0x10e(%rsi), %xmm0 + vmovdqu 0x110(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1e0(%rdi) + vmovdqu 0x120(%rsi), %xmm0 + vmovdqu 0x122(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x200(%rdi) + vmovdqu 0x132(%rsi), %xmm0 + vmovdqu 0x134(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x220(%rdi) + vmovdqu 0x144(%rsi), %xmm0 + vmovdqu 0x146(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x240(%rdi) + vmovdqu 0x156(%rsi), %xmm0 + vmovdqu 0x158(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x260(%rdi) + vmovdqu 0x168(%rsi), %xmm0 + vmovdqu 0x16a(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x280(%rdi) + vmovdqu 
0x17a(%rsi), %xmm0 + vmovdqu 0x17c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2a0(%rdi) + vmovdqu 0x18c(%rsi), %xmm0 + vmovdqu 0x18e(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2c0(%rdi) + vmovdqu 0x19e(%rsi), %xmm0 + vmovdqu 0x1a0(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2e0(%rdi) + vmovdqu 0x1b0(%rsi), %xmm0 + vmovdqu 0x1b2(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x300(%rdi) + vmovdqu 0x1c2(%rsi), %xmm0 + vmovdqu 0x1c4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x320(%rdi) + vmovdqu 0x1d4(%rsi), %xmm0 + vmovdqu 0x1d6(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x340(%rdi) + vmovdqu 0x1e6(%rsi), %xmm0 + vmovdqu 0x1e8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x360(%rdi) + vmovdqu 0x1f8(%rsi), %xmm0 + vmovdqu 0x1fa(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x380(%rdi) + vmovdqu 0x20a(%rsi), %xmm0 + vmovdqu 0x20c(%rsi), %xmm5 
+ vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3a0(%rdi) + vmovdqu 0x21c(%rsi), %xmm0 + vmovdqu 0x21e(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3c0(%rdi) + vmovdqu 0x22e(%rsi), %xmm0 + vmovdqu 0x230(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3e0(%rdi) + retq + .cfi_endproc + +MLD_ASM_FN_SIZE(polyz_unpack_17_avx2_asm) + + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ + !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \ + */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c b/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c deleted file mode 100644 index fe5c3895b..000000000 --- a/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) The mldsa-native project authors - * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - */ - -/* References - * ========== - * - * - [REF_AVX2] - * CRYSTALS-Dilithium optimized AVX2 implementation - * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé - * https://github.com/pq-crystals/dilithium/tree/master/avx2 - */ - -/* - * This file is derived from the public domain - * AVX2 Dilithium implementation @[REF_AVX2]. 
- */ - -#include "../../../common.h" - -#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ - (!defined(MLD_CONFIG_NO_SIGN_API) || \ - !defined(MLD_CONFIG_NO_VERIFY_API)) && \ - !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ - (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ - (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) - -#include -#include "arith_native_x86_64.h" - -void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a) -{ - unsigned int i; - __m256i f; - __m128i low, high; - - const __m256i shufbidx = _mm256_set_epi8( - -1, 31, 30, 29, -1, 29, 28, 27, -1, 26, 25, 24, -1, 24, 23, 22, -1, 9, 8, - 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0); - /* Equivalent to _mm256_set_epi32(4, 0, 4, 0, 4, 0, 4, 0) */ - const __m256i srlvdidx = _mm256_set1_epi64x((uint64_t)4 << 32); - const __m256i mask = _mm256_set1_epi32(0xFFFFF); - const __m256i gamma1 = _mm256_set1_epi32((1 << 19)); - - for (i = 0; i < MLDSA_N / 8; i++) - { - /* Load bytes 0..15 into low 128-bit vector */ - low = _mm_loadu_si128((__m128i *)&a[20 * i]); - /* Load bytes 4..19 into high 128-bit vector */ - high = _mm_loadu_si128((__m128i *)&a[20 * i + 4]); - /* Combine into 256-bit vector */ - f = _mm256_inserti128_si256(_mm256_castsi128_si256(low), high, 1); - - /* Shuffling 8-bit lanes - * - * ┌─ Indices 0-9 into low 128-bit half ───────────────────────────────────┐ - * │ Shuffle: [-1, 9, 8, 7, -1, 7, 6, 5, -1, 4, 3, 2, -1, 2, 1, 0] │ - * │ Result: [0, byte9, byte8, byte7, ..., 0, byte2, byte1, byte0] │ - * └───────────────────────────────────────────────────────────────────────┘ - * - * ┌─ Indices 16-31 into high 128-bit half ────────────────────────────────┐ - * │ Shuffle: [-1,31, 30, 29, -1,29, 28, 27, -1,26, 25, 24, -1,24, 23, 22] │ - * │ Result: [0, byte19, byte18, byte17, ..., 0, byte12, byte11, byte10] │ - * └───────────────────────────────────────────────────────────────────────┘ - */ - f = _mm256_shuffle_epi8(f, shufbidx); - - /* Keep only 20 out of 24 bits in each 32-bit 
lane */ - /* Bits 0..23 16..39 40..63 56..79 - * 80..103 96..119 120..143 136..159 */ - f = _mm256_srlv_epi32(f, srlvdidx); - /* Bits 0..23 20..39 40..63 60..79 - * 80..103 100..119 120..143 140..159 */ - f = _mm256_and_si256(f, mask); - /* Bits 0..19 20..39 40..59 60..79 - * 80..99 100..119 120..139 140..159 */ - - /* Map [0, 1, ..., 2^20-1] to [2^19, 2^19-1, ..., -2^19+1] */ - f = _mm256_sub_epi32(gamma1, f); - - _mm256_store_si256((__m256i *)&r[8 * i], f); - } -} - -#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ - !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87) */ - -MLD_EMPTY_CU(avx2_polyz_unpack_19) - -#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ - !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ - (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ - || MLD_CONFIG_PARAMETER_SET == 87)) */ diff --git a/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S b/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S new file mode 100644 index 000000000..f247ced81 --- /dev/null +++ b/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S @@ -0,0 +1,342 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_polyz_unpack_19_avx2_asm + * + * Description: Unpack polynomial z with 20-bit packed coefficients + * (GAMMA1 = 2^19). 
Maps packed [0, 2^20-1] to signed + * [-(2^19-1), 2^19] via GAMMA1 - x. + * + * Arguments: - int32_t *r: pointer to output polynomial (1024 bytes) + * - const uint8_t *a: pointer to packed input (640 bytes) + **************************************************/ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + (!defined(MLD_CONFIG_NO_SIGN_API) || \ + !defined(MLD_CONFIG_NO_VERIFY_API)) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \ + (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \ + (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87)) + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/polyz_unpack_19_avx2_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +.global MLD_ASM_NAMESPACE(polyz_unpack_19_avx2_asm) +MLD_ASM_FN_SYMBOL(polyz_unpack_19_avx2_asm) + + .cfi_startproc + movabsq $-0xfbfcfd00fdff00, %rax # imm = 0xFF040302FF020100 + vmovq %rax, %xmm1 + movabsq $-0xf6f7f800f8f9fb, %rax # imm = 0xFF090807FF070605 + vpinsrq $0x1, %rax, %xmm1, %xmm1 + movabsq $-0xe5e6e700e7e8ea, %rax # imm = 0xFF1A1918FF181716 + vmovq %rax, %xmm5 + movabsq $-0xe0e1e200e2e3e5, %rax # imm = 0xFF1F1E1DFF1D1C1B + vpinsrq $0x1, %rax, %xmm5, %xmm5 + vinserti128 $0x1, %xmm5, %ymm1, %ymm1 + movabsq $0x400000000, %rax # imm = 0x400000000 + vmovq %rax, %xmm2 + movabsq $0x400000000, %rax # imm = 0x400000000 + vpinsrq $0x1, %rax, %xmm2, %xmm2 + vinserti128 $0x1, %xmm2, %ymm2, %ymm2 + movl $0xfffff, %eax # imm = 0xFFFFF + vmovd %eax, %xmm3 + vpbroadcastd %xmm3, %ymm3 + movl $0x80000, %eax # imm = 0x80000 + vmovd %eax, %xmm4 + vpbroadcastd %xmm4, %ymm4 + vmovdqu (%rsi), %xmm0 + vmovdqu 0x4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, (%rdi) + vmovdqu 0x14(%rsi), %xmm0 + vmovdqu 0x18(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, 
%ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x20(%rdi) + vmovdqu 0x28(%rsi), %xmm0 + vmovdqu 0x2c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x40(%rdi) + vmovdqu 0x3c(%rsi), %xmm0 + vmovdqu 0x40(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x60(%rdi) + vmovdqu 0x50(%rsi), %xmm0 + vmovdqu 0x54(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x80(%rdi) + vmovdqu 0x64(%rsi), %xmm0 + vmovdqu 0x68(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xa0(%rdi) + vmovdqu 0x78(%rsi), %xmm0 + vmovdqu 0x7c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xc0(%rdi) + vmovdqu 0x8c(%rsi), %xmm0 + vmovdqu 0x90(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xe0(%rdi) + vmovdqu 0xa0(%rsi), %xmm0 + vmovdqu 0xa4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x100(%rdi) + vmovdqu 0xb4(%rsi), %xmm0 + vmovdqu 0xb8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, 
%ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x120(%rdi) + vmovdqu 0xc8(%rsi), %xmm0 + vmovdqu 0xcc(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x140(%rdi) + vmovdqu 0xdc(%rsi), %xmm0 + vmovdqu 0xe0(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x160(%rdi) + vmovdqu 0xf0(%rsi), %xmm0 + vmovdqu 0xf4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x180(%rdi) + vmovdqu 0x104(%rsi), %xmm0 + vmovdqu 0x108(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1a0(%rdi) + vmovdqu 0x118(%rsi), %xmm0 + vmovdqu 0x11c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1c0(%rdi) + vmovdqu 0x12c(%rsi), %xmm0 + vmovdqu 0x130(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1e0(%rdi) + vmovdqu 0x140(%rsi), %xmm0 + vmovdqu 0x144(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x200(%rdi) + vmovdqu 0x154(%rsi), %xmm0 + vmovdqu 0x158(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, 
%ymm4, %ymm0 + vmovdqa %ymm0, 0x220(%rdi) + vmovdqu 0x168(%rsi), %xmm0 + vmovdqu 0x16c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x240(%rdi) + vmovdqu 0x17c(%rsi), %xmm0 + vmovdqu 0x180(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x260(%rdi) + vmovdqu 0x190(%rsi), %xmm0 + vmovdqu 0x194(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x280(%rdi) + vmovdqu 0x1a4(%rsi), %xmm0 + vmovdqu 0x1a8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2a0(%rdi) + vmovdqu 0x1b8(%rsi), %xmm0 + vmovdqu 0x1bc(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2c0(%rdi) + vmovdqu 0x1cc(%rsi), %xmm0 + vmovdqu 0x1d0(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2e0(%rdi) + vmovdqu 0x1e0(%rsi), %xmm0 + vmovdqu 0x1e4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x300(%rdi) + vmovdqu 0x1f4(%rsi), %xmm0 + vmovdqu 0x1f8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x320(%rdi) + 
vmovdqu 0x208(%rsi), %xmm0 + vmovdqu 0x20c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x340(%rdi) + vmovdqu 0x21c(%rsi), %xmm0 + vmovdqu 0x220(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x360(%rdi) + vmovdqu 0x230(%rsi), %xmm0 + vmovdqu 0x234(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x380(%rdi) + vmovdqu 0x244(%rsi), %xmm0 + vmovdqu 0x248(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3a0(%rdi) + vmovdqu 0x258(%rsi), %xmm0 + vmovdqu 0x25c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3c0(%rdi) + vmovdqu 0x26c(%rsi), %xmm0 + vmovdqu 0x270(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3e0(%rdi) + retq + .cfi_endproc + +MLD_ASM_FN_SIZE(polyz_unpack_19_avx2_asm) + + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \ + !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \ + (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \ + || MLD_CONFIG_PARAMETER_SET == 87) */ + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/proofs/cbmc/polyz_unpack_native_x86_64/Makefile b/proofs/cbmc/polyz_unpack_native_x86_64/Makefile new file mode 100644 
index 000000000..eb616114d --- /dev/null +++ b/proofs/cbmc/polyz_unpack_native_x86_64/Makefile @@ -0,0 +1,49 @@ +# Copyright (c) The mldsa-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +include ../Makefile_params.common + +HARNESS_ENTRY = harness +HARNESS_FILE = polyz_unpack_native_x86_64_harness + +# This should be a unique identifier for this proof, and will appear on the +# Litani dashboard. It can be human-readable and contain spaces if you wish. +PROOF_UID = polyz_unpack_native_x86_64 + +# We need to set MLD_CHECK_APIS as otherwise mldsa/src/native/api.h won't be +# included, which contains the CBMC specifications. +DEFINES += -DMLD_CONFIG_USE_NATIVE_BACKEND_ARITH -DMLD_CONFIG_ARITH_BACKEND_FILE="\"$(SRCDIR)/mldsa/src/native/x86_64/meta.h\"" -DMLD_CHECK_APIS +INCLUDES += + +REMOVE_FUNCTION_BODY += +UNWINDSET += + +PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c +PROJECT_SOURCES += $(SRCDIR)/mldsa/src/poly_kl.c + +# polyz_unpack_17 is used with ML-DSA-44 (GAMMA1 = 2^17); +# polyz_unpack_19 with ML-DSA-65 and ML-DSA-87 (GAMMA1 = 2^19). +ifeq ($(MLD_CONFIG_PARAMETER_SET),44) + CHECK_FUNCTION_CONTRACTS=mld_polyz_unpack_17_native + USE_FUNCTION_CONTRACTS=mld_polyz_unpack_17_avx2_asm +else ifeq ($(MLD_CONFIG_PARAMETER_SET),65) + CHECK_FUNCTION_CONTRACTS=mld_polyz_unpack_19_native + USE_FUNCTION_CONTRACTS=mld_polyz_unpack_19_avx2_asm +else ifeq ($(MLD_CONFIG_PARAMETER_SET),87) + CHECK_FUNCTION_CONTRACTS=mld_polyz_unpack_19_native + USE_FUNCTION_CONTRACTS=mld_polyz_unpack_19_avx2_asm +endif +USE_FUNCTION_CONTRACTS+=mld_sys_check_capability +APPLY_LOOP_CONTRACTS=on +USE_DYNAMIC_FRAMES=1 + +# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead +EXTERNAL_SAT_SOLVER= +CBMCFLAGS=--smt2 + +FUNCTION_NAME = polyz_unpack_native_x86_64 + +# This function is large enough to need... 
+CBMC_OBJECT_BITS = 8 + +include ../Makefile.common diff --git a/proofs/cbmc/polyz_unpack_native_x86_64/polyz_unpack_native_x86_64_harness.c b/proofs/cbmc/polyz_unpack_native_x86_64/polyz_unpack_native_x86_64_harness.c new file mode 100644 index 000000000..8e143d315 --- /dev/null +++ b/proofs/cbmc/polyz_unpack_native_x86_64/polyz_unpack_native_x86_64_harness.c @@ -0,0 +1,25 @@ +// Copyright (c) The mldsa-native project authors +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +#include <stdint.h> +#include "cbmc.h" +#include "params.h" + +#if MLDSA_GAMMA1 == (1 << 17) +int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a); +#else +int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a); +#endif + +void harness(void) +{ + int32_t *r; + const uint8_t *a; + int t; + +#if MLDSA_GAMMA1 == (1 << 17) + t = mld_polyz_unpack_17_native(r, a); +#else + t = mld_polyz_unpack_19_native(r, a); +#endif +} diff --git a/proofs/hol_light/README.md b/proofs/hol_light/README.md index 9bc9ea8bb..606c4e657 100644 --- a/proofs/hol_light/README.md +++ b/proofs/hol_light/README.md @@ -168,6 +168,8 @@ All routines listed below have been proven correct, memory-safe, and secret-inde * x86_64 pointwise multiplication-accumulation (l=5): [pointwise_acc_l5_avx2_asm.S](x86_64/mldsa/pointwise_acc_l5_avx2_asm.S) * x86_64 pointwise multiplication-accumulation (l=7): [pointwise_acc_l7_avx2_asm.S](x86_64/mldsa/pointwise_acc_l7_avx2_asm.S) * x86_64 poly_chknorm: [poly_chknorm_avx2_asm.S](x86_64/mldsa/poly_chknorm_avx2_asm.S) + * x86_64 polyz_unpack (l=4): [polyz_unpack_17_avx2_asm.S](x86_64/mldsa/polyz_unpack_17_avx2_asm.S) + * x86_64 polyz_unpack (l=5,7): [polyz_unpack_19_avx2_asm.S](x86_64/mldsa/polyz_unpack_19_avx2_asm.S) - FIPS202: * 4-fold Keccak-F1600 using AVX2: [keccak_f1600_x4_avx2_asm.S](x86_64/mldsa/keccak_f1600_x4_avx2_asm.S) diff --git a/proofs/hol_light/x86_64/Makefile b/proofs/hol_light/x86_64/Makefile index f5e816ad9..693078496 100644 --- a/proofs/hol_light/x86_64/Makefile +++
b/proofs/hol_light/x86_64/Makefile @@ -55,6 +55,8 @@ OBJ = mldsa/ntt_avx2_asm.o \ mldsa/nttunpack_avx2_asm.o \ mldsa/poly_caddq_avx2_asm.o \ mldsa/poly_chknorm_avx2_asm.o \ + mldsa/polyz_unpack_17_avx2_asm.o \ + mldsa/polyz_unpack_19_avx2_asm.o \ mldsa/pointwise_avx2_asm.o \ mldsa/pointwise_acc_l4_avx2_asm.o \ mldsa/pointwise_acc_l5_avx2_asm.o \ diff --git a/proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S b/proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S new file mode 100644 index 000000000..2bb921f9e --- /dev/null +++ b/proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S @@ -0,0 +1,333 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_polyz_unpack_17_avx2_asm + * + * Description: Unpack polynomial z with 18-bit packed coefficients + * (GAMMA1 = 2^17). Maps packed [0, 2^18-1] to signed + * [-(2^17-1), 2^17] via GAMMA1 - x. + * + * Arguments: - int32_t *r: pointer to output polynomial (1024 bytes) + * - const uint8_t *a: pointer to packed input (576 bytes) + **************************************************/ + + + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/polyz_unpack_17_avx2_asm.S using scripts/simpasm. Do not modify it directly. 
+ */ + +.text +.balign 4 +#ifdef __APPLE__ +.global _mld_polyz_unpack_17_avx2_asm +_mld_polyz_unpack_17_avx2_asm: +#else +.global mld_polyz_unpack_17_avx2_asm +mld_polyz_unpack_17_avx2_asm: +#endif + + .cfi_startproc + endbr64 + movabsq $-0xfbfcfd00fdff00, %rax # imm = 0xFF040302FF020100 + vmovq %rax, %xmm1 + movabsq $-0xf7f8f900f9fafc, %rax # imm = 0xFF080706FF060504 + vpinsrq $0x1, %rax, %xmm1, %xmm1 + movabsq $-0xe4e5e600e6e7e9, %rax # imm = 0xFF1B1A19FF191817 + vmovq %rax, %xmm5 + movabsq $-0xe0e1e200e2e3e5, %rax # imm = 0xFF1F1E1DFF1D1C1B + vpinsrq $0x1, %rax, %xmm5, %xmm5 + vinserti128 $0x1, %xmm5, %ymm1, %ymm1 + movabsq $0x200000000, %rax # imm = 0x200000000 + vmovq %rax, %xmm2 + movabsq $0x600000004, %rax # imm = 0x600000004 + vpinsrq $0x1, %rax, %xmm2, %xmm2 + vinserti128 $0x1, %xmm2, %ymm2, %ymm2 + movl $0x3ffff, %eax # imm = 0x3FFFF + vmovd %eax, %xmm3 + vpbroadcastd %xmm3, %ymm3 + movl $0x20000, %eax # imm = 0x20000 + vmovd %eax, %xmm4 + vpbroadcastd %xmm4, %ymm4 + vmovdqu (%rsi), %xmm0 + vmovdqu 0x2(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, (%rdi) + vmovdqu 0x12(%rsi), %xmm0 + vmovdqu 0x14(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x20(%rdi) + vmovdqu 0x24(%rsi), %xmm0 + vmovdqu 0x26(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x40(%rdi) + vmovdqu 0x36(%rsi), %xmm0 + vmovdqu 0x38(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x60(%rdi) + vmovdqu 0x48(%rsi), %xmm0 + vmovdqu 0x4a(%rsi), 
%xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x80(%rdi) + vmovdqu 0x5a(%rsi), %xmm0 + vmovdqu 0x5c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xa0(%rdi) + vmovdqu 0x6c(%rsi), %xmm0 + vmovdqu 0x6e(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xc0(%rdi) + vmovdqu 0x7e(%rsi), %xmm0 + vmovdqu 0x80(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xe0(%rdi) + vmovdqu 0x90(%rsi), %xmm0 + vmovdqu 0x92(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x100(%rdi) + vmovdqu 0xa2(%rsi), %xmm0 + vmovdqu 0xa4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x120(%rdi) + vmovdqu 0xb4(%rsi), %xmm0 + vmovdqu 0xb6(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x140(%rdi) + vmovdqu 0xc6(%rsi), %xmm0 + vmovdqu 0xc8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x160(%rdi) + vmovdqu 0xd8(%rsi), %xmm0 + vmovdqu 0xda(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, 
%ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x180(%rdi) + vmovdqu 0xea(%rsi), %xmm0 + vmovdqu 0xec(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1a0(%rdi) + vmovdqu 0xfc(%rsi), %xmm0 + vmovdqu 0xfe(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1c0(%rdi) + vmovdqu 0x10e(%rsi), %xmm0 + vmovdqu 0x110(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1e0(%rdi) + vmovdqu 0x120(%rsi), %xmm0 + vmovdqu 0x122(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x200(%rdi) + vmovdqu 0x132(%rsi), %xmm0 + vmovdqu 0x134(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x220(%rdi) + vmovdqu 0x144(%rsi), %xmm0 + vmovdqu 0x146(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x240(%rdi) + vmovdqu 0x156(%rsi), %xmm0 + vmovdqu 0x158(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x260(%rdi) + vmovdqu 0x168(%rsi), %xmm0 + vmovdqu 0x16a(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand 
%ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x280(%rdi) + vmovdqu 0x17a(%rsi), %xmm0 + vmovdqu 0x17c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2a0(%rdi) + vmovdqu 0x18c(%rsi), %xmm0 + vmovdqu 0x18e(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2c0(%rdi) + vmovdqu 0x19e(%rsi), %xmm0 + vmovdqu 0x1a0(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2e0(%rdi) + vmovdqu 0x1b0(%rsi), %xmm0 + vmovdqu 0x1b2(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x300(%rdi) + vmovdqu 0x1c2(%rsi), %xmm0 + vmovdqu 0x1c4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x320(%rdi) + vmovdqu 0x1d4(%rsi), %xmm0 + vmovdqu 0x1d6(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x340(%rdi) + vmovdqu 0x1e6(%rsi), %xmm0 + vmovdqu 0x1e8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x360(%rdi) + vmovdqu 0x1f8(%rsi), %xmm0 + vmovdqu 0x1fa(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, 
%ymm0 + vmovdqa %ymm0, 0x380(%rdi) + vmovdqu 0x20a(%rsi), %xmm0 + vmovdqu 0x20c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3a0(%rdi) + vmovdqu 0x21c(%rsi), %xmm0 + vmovdqu 0x21e(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3c0(%rdi) + vmovdqu 0x22e(%rsi), %xmm0 + vmovdqu 0x230(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3e0(%rdi) + retq + .cfi_endproc + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S b/proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S new file mode 100644 index 000000000..999977f1c --- /dev/null +++ b/proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S @@ -0,0 +1,333 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* References + * ========== + * + * - [REF_AVX2] + * CRYSTALS-Dilithium optimized AVX2 implementation + * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé + * https://github.com/pq-crystals/dilithium/tree/master/avx2 + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + + +/************************************************* + * Name: mld_polyz_unpack_19_avx2_asm + * + * Description: Unpack polynomial z with 20-bit packed coefficients + * (GAMMA1 = 2^19). Maps packed [0, 2^20-1] to signed + * [-(2^19-1), 2^19] via GAMMA1 - x. 
+ * + * Arguments: - int32_t *r: pointer to output polynomial (1024 bytes) + * - const uint8_t *a: pointer to packed input (640 bytes) + **************************************************/ + + + + +/* + * WARNING: This file is auto-derived from the mldsa-native source file + * dev/x86_64/src/polyz_unpack_19_avx2_asm.S using scripts/simpasm. Do not modify it directly. + */ + +.text +.balign 4 +#ifdef __APPLE__ +.global _mld_polyz_unpack_19_avx2_asm +_mld_polyz_unpack_19_avx2_asm: +#else +.global mld_polyz_unpack_19_avx2_asm +mld_polyz_unpack_19_avx2_asm: +#endif + + .cfi_startproc + endbr64 + movabsq $-0xfbfcfd00fdff00, %rax # imm = 0xFF040302FF020100 + vmovq %rax, %xmm1 + movabsq $-0xf6f7f800f8f9fb, %rax # imm = 0xFF090807FF070605 + vpinsrq $0x1, %rax, %xmm1, %xmm1 + movabsq $-0xe5e6e700e7e8ea, %rax # imm = 0xFF1A1918FF181716 + vmovq %rax, %xmm5 + movabsq $-0xe0e1e200e2e3e5, %rax # imm = 0xFF1F1E1DFF1D1C1B + vpinsrq $0x1, %rax, %xmm5, %xmm5 + vinserti128 $0x1, %xmm5, %ymm1, %ymm1 + movabsq $0x400000000, %rax # imm = 0x400000000 + vmovq %rax, %xmm2 + movabsq $0x400000000, %rax # imm = 0x400000000 + vpinsrq $0x1, %rax, %xmm2, %xmm2 + vinserti128 $0x1, %xmm2, %ymm2, %ymm2 + movl $0xfffff, %eax # imm = 0xFFFFF + vmovd %eax, %xmm3 + vpbroadcastd %xmm3, %ymm3 + movl $0x80000, %eax # imm = 0x80000 + vmovd %eax, %xmm4 + vpbroadcastd %xmm4, %ymm4 + vmovdqu (%rsi), %xmm0 + vmovdqu 0x4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, (%rdi) + vmovdqu 0x14(%rsi), %xmm0 + vmovdqu 0x18(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x20(%rdi) + vmovdqu 0x28(%rsi), %xmm0 + vmovdqu 0x2c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand 
%ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x40(%rdi) + vmovdqu 0x3c(%rsi), %xmm0 + vmovdqu 0x40(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x60(%rdi) + vmovdqu 0x50(%rsi), %xmm0 + vmovdqu 0x54(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x80(%rdi) + vmovdqu 0x64(%rsi), %xmm0 + vmovdqu 0x68(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xa0(%rdi) + vmovdqu 0x78(%rsi), %xmm0 + vmovdqu 0x7c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xc0(%rdi) + vmovdqu 0x8c(%rsi), %xmm0 + vmovdqu 0x90(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0xe0(%rdi) + vmovdqu 0xa0(%rsi), %xmm0 + vmovdqu 0xa4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x100(%rdi) + vmovdqu 0xb4(%rsi), %xmm0 + vmovdqu 0xb8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x120(%rdi) + vmovdqu 0xc8(%rsi), %xmm0 + vmovdqu 0xcc(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 
0x140(%rdi) + vmovdqu 0xdc(%rsi), %xmm0 + vmovdqu 0xe0(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x160(%rdi) + vmovdqu 0xf0(%rsi), %xmm0 + vmovdqu 0xf4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x180(%rdi) + vmovdqu 0x104(%rsi), %xmm0 + vmovdqu 0x108(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1a0(%rdi) + vmovdqu 0x118(%rsi), %xmm0 + vmovdqu 0x11c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1c0(%rdi) + vmovdqu 0x12c(%rsi), %xmm0 + vmovdqu 0x130(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x1e0(%rdi) + vmovdqu 0x140(%rsi), %xmm0 + vmovdqu 0x144(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x200(%rdi) + vmovdqu 0x154(%rsi), %xmm0 + vmovdqu 0x158(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x220(%rdi) + vmovdqu 0x168(%rsi), %xmm0 + vmovdqu 0x16c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x240(%rdi) + vmovdqu 0x17c(%rsi), %xmm0 + vmovdqu 
0x180(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x260(%rdi) + vmovdqu 0x190(%rsi), %xmm0 + vmovdqu 0x194(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x280(%rdi) + vmovdqu 0x1a4(%rsi), %xmm0 + vmovdqu 0x1a8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2a0(%rdi) + vmovdqu 0x1b8(%rsi), %xmm0 + vmovdqu 0x1bc(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2c0(%rdi) + vmovdqu 0x1cc(%rsi), %xmm0 + vmovdqu 0x1d0(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x2e0(%rdi) + vmovdqu 0x1e0(%rsi), %xmm0 + vmovdqu 0x1e4(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x300(%rdi) + vmovdqu 0x1f4(%rsi), %xmm0 + vmovdqu 0x1f8(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x320(%rdi) + vmovdqu 0x208(%rsi), %xmm0 + vmovdqu 0x20c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x340(%rdi) + vmovdqu 0x21c(%rsi), %xmm0 + vmovdqu 0x220(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, 
%ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x360(%rdi) + vmovdqu 0x230(%rsi), %xmm0 + vmovdqu 0x234(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x380(%rdi) + vmovdqu 0x244(%rsi), %xmm0 + vmovdqu 0x248(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3a0(%rdi) + vmovdqu 0x258(%rsi), %xmm0 + vmovdqu 0x25c(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3c0(%rdi) + vmovdqu 0x26c(%rsi), %xmm0 + vmovdqu 0x270(%rsi), %xmm5 + vinserti128 $0x1, %xmm5, %ymm0, %ymm0 + vpshufb %ymm1, %ymm0, %ymm0 + vpsrlvd %ymm2, %ymm0, %ymm0 + vpand %ymm3, %ymm0, %ymm0 + vpsubd %ymm0, %ymm4, %ymm0 + vmovdqa %ymm0, 0x3e0(%rdi) + retq + .cfi_endproc + +#if defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif diff --git a/proofs/hol_light/x86_64/proofs/dump_bytecode.ml b/proofs/hol_light/x86_64/proofs/dump_bytecode.ml index 82dd40f5a..d5b7ad12f 100644 --- a/proofs/hol_light/x86_64/proofs/dump_bytecode.ml +++ b/proofs/hol_light/x86_64/proofs/dump_bytecode.ml @@ -44,3 +44,11 @@ print_string "==== bytecode end =====================================\n\n";; print_string "=== bytecode start: x86_64/mldsa/poly_chknorm_avx2_asm.o ================\n";; print_literal_from_elf "x86_64/mldsa/poly_chknorm_avx2_asm.o";; print_string "==== bytecode end =====================================\n\n";; + +print_string "=== bytecode start: x86_64/mldsa/polyz_unpack_17_avx2_asm.o ================\n";; +print_literal_from_elf "x86_64/mldsa/polyz_unpack_17_avx2_asm.o";; +print_string "==== 
bytecode end =====================================\n\n";; + +print_string "=== bytecode start: x86_64/mldsa/polyz_unpack_19_avx2_asm.o ================\n";; +print_literal_from_elf "x86_64/mldsa/polyz_unpack_19_avx2_asm.o";; +print_string "==== bytecode end =====================================\n\n";; diff --git a/proofs/hol_light/x86_64/proofs/mldsa_utils.ml b/proofs/hol_light/x86_64/proofs/mldsa_utils.ml index f52935275..88b6bf6ac 100644 --- a/proofs/hol_light/x86_64/proofs/mldsa_utils.ml +++ b/proofs/hol_light/x86_64/proofs/mldsa_utils.ml @@ -75,3 +75,91 @@ let MAP_UNTIL_TARGET_PC f n = fun (asl, w) -> (TARGET_PC_REACHED_TAC target_pc ORELSE (f n THEN core (n + 1))) (asl, w) in core n (asl, w);; + +(* ------------------------------------------------------------------------- *) +(* Coefficient (un)packing helpers shared across the polyz_unpack proofs. *) +(* ------------------------------------------------------------------------- *) + +(* Split ncoeffs d-bit coefficients into chunks of chunk_size. *) +let mk_split_theorem d ncoeffs chunk_size = + let total = d * chunk_size in + let nchunks = ncoeffs / chunk_size in + let d_ty = mk_finty (Num.num_of_int d) in + let total_ty = mk_finty (Num.num_of_int total) in + prove( + subst [mk_small_numeral ncoeffs, `ncoeffs:num`; + mk_small_numeral chunk_size, `cs:num`; + mk_small_numeral nchunks, `nc:num`] + (inst [d_ty, `:D`; total_ty, `:T`] + `!(l: (D word) list). LENGTH l = ncoeffs ==> + num_of_wordlist l = num_of_wordlist (MAP ((word:num->T word) o num_of_wordlist) + (list_of_seq (\i. 
SUB_LIST (cs * i, cs) l) nc))`), + REPEAT STRIP_TAC THEN + UNDISCH_THEN (subst [mk_small_numeral ncoeffs, `n:num`] + (inst [d_ty, `:D`] `LENGTH (l : (D word) list) = n`)) (fun th -> + GEN_REWRITE_TAC (LAND_CONV o ONCE_DEPTH_CONV) + [MATCH_MP (CONV_RULE NUM_REDUCE_CONV + (ISPECL [mk_small_numeral chunk_size; mk_small_numeral nchunks; + `l:'a list`] SUBLIST_PARTITION)) th] + THEN ASSUME_TAC th) THEN + IMP_REWRITE_TAC [CONV_RULE (ONCE_DEPTH_CONV DIMINDEX_CONV THENC NUM_REDUCE_CONV) + (ISPECL [inst [d_ty, `:D`] `ll: ((D word) list) list`; + mk_small_numeral chunk_size] + (INST_TYPE [d_ty, `:N`; total_ty, `:M`] NUM_OF_WORDLIST_FLATTEN))] THEN + CONV_TAC(ONCE_DEPTH_CONV LIST_OF_SEQ_CONV) THEN + ASM_REWRITE_TAC[ALL; LENGTH_SUB_LIST] THEN + ARITH_TAC);; + +(* Extract individual d-bit coefficients from a (d*chunk_size)-bit word. *) +let mk_subword_cases d chunk_size = + let total = d * chunk_size in + let d_ty = mk_finty (Num.num_of_int d) in + let total_ty = mk_finty (Num.num_of_int total) in + let arith_simp = + let lhs = mk_eq(mk_small_numeral total, + mk_comb(mk_comb(`( * ):num->num->num`, + mk_small_numeral d), `n:num`)) in + let rhs = mk_eq(`n:num`, mk_small_numeral chunk_size) in + ARITH_RULE (mk_eq(lhs, rhs)) in + let meson_simp = + let n_eq = mk_eq(`n:num`, mk_small_numeral chunk_size) in + let k_lt_n = mk_comb(mk_comb(`(<):num->num->bool`, `k:num`), `n:num`) in + let k_lt_cs = mk_comb(mk_comb(`(<):num->num->bool`, `k:num`), + mk_small_numeral chunk_size) in + MESON[] (mk_eq(mk_conj(n_eq, k_lt_n), mk_conj(n_eq, k_lt_cs))) in + let base = + let th = INST_TYPE [total_ty, `:KL`; d_ty, `:L`] WORD_SUBWORD_NUM_OF_WORDLIST in + let th = CONV_RULE(DEPTH_CONV DIMINDEX_CONV) th in + REWRITE_RULE[arith_simp; meson_simp] th in + let mk k = + let th = SPEC (mk_small_numeral k) + (SPEC (inst [d_ty, `:L`] `ls:(L word)list`) base) in + CONV_RULE NUM_REDUCE_CONV (REWRITE_RULE[ARITH] th) in + map mk (0 -- (chunk_size - 1));; + +(* Split a 256-element 32-bit-word list into 32 chunks 
of 8 (256-bit words), + used to express the output spec as 32 store-sized pieces. *) +let NUM_OF_WORDLIST_SPLIT_32_256_8 = prove + (`!(L:(32 word) list). LENGTH L = 256 ==> num_of_wordlist L = + num_of_wordlist (MAP ((word:num->256 word) o num_of_wordlist) + (list_of_seq (\i. SUB_LIST(8*i,8) L) 32))`, + REPEAT STRIP_TAC THEN + UNDISCH_THEN `LENGTH(L:(32 word)list)=256` (fun th -> + GEN_REWRITE_TAC (LAND_CONV o ONCE_DEPTH_CONV) + [MATCH_MP (CONV_RULE NUM_REDUCE_CONV + (ISPECL [`8`;`32`;`L:(32 word)list`] SUBLIST_PARTITION)) th] THEN ASSUME_TAC th) THEN + IMP_REWRITE_TAC[CONV_RULE(ONCE_DEPTH_CONV DIMINDEX_CONV THENC NUM_REDUCE_CONV) + (ISPECL [`ll:((32 word)list)list`;`8`] (INST_TYPE[`:32`,`:N`;`:256`,`:M`] NUM_OF_WORDLIST_FLATTEN))] THEN + CONV_TAC(ONCE_DEPTH_CONV LIST_OF_SEQ_CONV) THEN + ASM_REWRITE_TAC[ALL;LENGTH_SUB_LIST] THEN ARITH_TAC);; + +(* MAP commutes with SUB_LIST. *) +let MAP_SUB_LIST = prove + (`!(f:A->B) p q l. MAP f (SUB_LIST(p,q) l) = SUB_LIST(p,q) (MAP f l)`, + GEN_TAC THEN + ONCE_REWRITE_TAC[MESON[] `(!p q l. P p q l) <=> (!l p q. P p q l)`] THEN + LIST_INDUCT_TAC THEN REWRITE_TAC[MAP; SUB_LIST_CLAUSES] THEN + REPEAT GEN_TAC THEN SPEC_TAC(`q:num`,`q:num`) THEN SPEC_TAC(`p:num`,`p:num`) THEN + MATCH_MP_TAC num_INDUCTION THEN ASM_REWRITE_TAC[SUB_LIST_CLAUSES; MAP] THEN + REPEAT STRIP_TAC THEN SPEC_TAC(`q:num`,`q:num`) THEN MATCH_MP_TAC num_INDUCTION THEN + ASM_REWRITE_TAC[SUB_LIST_CLAUSES; MAP]);; diff --git a/proofs/hol_light/x86_64/proofs/polyz_unpack_17_avx2_asm.ml b/proofs/hol_light/x86_64/proofs/polyz_unpack_17_avx2_asm.ml new file mode 100644 index 000000000..68d43b0e6 --- /dev/null +++ b/proofs/hol_light/x86_64/proofs/polyz_unpack_17_avx2_asm.ml @@ -0,0 +1,909 @@ +(* + * Copyright (c) The mldsa-native project authors + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Functional correctness of polyz_unpack_17 (x86_64 AVX2): *) +(* Unpack polynomial z with 18-bit packed coefficients (GAMMA1 = 2^17) *) +(* Maps packed [0, 2^18-1] to signed [-(2^17-1), 2^17] via GAMMA1 - x *) +(* (ML-DSA-44). *) +(* *) +(* The x86 routine builds the shuffle/shift/mask/gamma1 constants inline *) +(* (VMOVQ/VPINSRQ/VINSERTI128/VPBROADCASTD) and unpacks 8 coefficients per *) +(* block with VPSHUFB/VPSRLVD/VPAND/VPSUBD. *) +(* ========================================================================= *) + +needs "s2n_bignum/x86/proofs/base.ml";; +needs "mldsa_native/common/mldsa_specs.ml";; +needs "mldsa_native/x86_64/proofs/mldsa_utils.ml";; + +(**** print_literal_from_elf "x86_64/mldsa/polyz_unpack_17_avx2_asm.o";; + ****) + +let mldsa_polyz_unpack_17_mc = define_assert_from_elf + "mldsa_polyz_unpack_17_mc" "x86_64/mldsa/polyz_unpack_17_avx2_asm.o" +(*** BYTECODE START ***) +[ + 0xf3; 0x0f; 0x1e; 0xfa; (* ENDBR64 *) + 0x48; 0xb8; 0x00; 0x01; 0x02; 0xff; 0x02; 0x03; 0x04; 0xff; + (* MOV (% rax) (Imm64 (word 18375815690981605632)) *) + 0xc4; 0xe1; 0xf9; 0x6e; 0xc8; + (* VMOVQ (%_% xmm1) (% rax) *) + 0x48; 0xb8; 0x04; 0x05; 0x06; 0xff; 0x06; 0x07; 0x08; 0xff; + (* MOV (% rax) (Imm64 (word 18376946006115091716)) *) + 0xc4; 0xe3; 0xf1; 0x22; 0xc8; 0x01; + (* VPINSRQ (%_% xmm1) (%_% xmm1) (% rax) (Imm8 (word 1)) *) + 0x48; 0xb8; 0x17; 0x18; 0x19; 0xff; 0x19; 0x1a; 0x1b; 0xff; + (* MOV (% rax) (Imm64 (word 18382315002999150615)) *) + 0xc4; 0xe1; 0xf9; 0x6e; 0xe8; + (* VMOVQ (%_% xmm5) (% rax) *) + 0x48; 0xb8; 0x1b; 0x1c; 0x1d; 0xff; 0x1d; 0x1e; 0x1f; 0xff; + (* MOV (% rax) (Imm64 (word 18383445318132636699)) *) + 0xc4; 0xe3; 0xd1; 0x22; 0xe8; 0x01; + (* VPINSRQ (%_% xmm5) (%_% xmm5) (% rax) (Imm8 (word 1)) *) + 0xc4; 0xe3; 0x75; 0x38; 0xcd; 0x01; + (* VINSERTI128 (%_% ymm1) (%_% ymm1) (%_% xmm5) (Imm8 (word 1)) 
*) + 0x48; 0xb8; 0x00; 0x00; 0x00; 0x00; 0x02; 0x00; 0x00; 0x00; + (* MOV (% rax) (Imm64 (word 8589934592)) *) + 0xc4; 0xe1; 0xf9; 0x6e; 0xd0; + (* VMOVQ (%_% xmm2) (% rax) *) + 0x48; 0xb8; 0x04; 0x00; 0x00; 0x00; 0x06; 0x00; 0x00; 0x00; + (* MOV (% rax) (Imm64 (word 25769803780)) *) + 0xc4; 0xe3; 0xe9; 0x22; 0xd0; 0x01; + (* VPINSRQ (%_% xmm2) (%_% xmm2) (% rax) (Imm8 (word 1)) *) + 0xc4; 0xe3; 0x6d; 0x38; 0xd2; 0x01; + (* VINSERTI128 (%_% ymm2) (%_% ymm2) (%_% xmm2) (Imm8 (word 1)) *) + 0xb8; 0xff; 0xff; 0x03; 0x00; + (* MOV (% eax) (Imm32 (word 262143)) *) + 0xc5; 0xf9; 0x6e; 0xd8; (* VMOVD (%_% xmm3) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xdb; + (* VPBROADCASTD (%_% ymm3) (%_% xmm3) *) + 0xb8; 0x00; 0x00; 0x02; 0x00; + (* MOV (% eax) (Imm32 (word 131072)) *) + 0xc5; 0xf9; 0x6e; 0xe0; (* VMOVD (%_% xmm4) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xe4; + (* VPBROADCASTD (%_% ymm4) (%_% xmm4) *) + 0xc5; 0xfa; 0x6f; 0x06; (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,0))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x02; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,2))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x07; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x12; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,18))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x14; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,20))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) 
(%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x47; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x24; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,36))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x26; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,38))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x47; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x36; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,54))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x38; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,56))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x47; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x48; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,72))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x4a; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,74))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 
0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x5a; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,90))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x5c; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,92))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x6c; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,108))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x6e; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,110))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x7e; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,126))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,128))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 
(word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x90; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,144))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x92; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,146))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xa2; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,162))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xa4; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,164))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xb4; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% 
(rsi,180))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xb6; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,182))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xc6; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,198))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xc8; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,200))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xd8; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,216))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xda; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,218))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% 
ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xea; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,234))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xec; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,236))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xfc; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,252))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xfe; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,254))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x0e; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,270))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x10; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,272))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 
0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,288))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x22; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,290))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,512))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x32; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,306))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x34; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,308))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,544))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x44; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,324))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x46; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 
(%% (rsi,326))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,576))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x56; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,342))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x58; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,344))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,608))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x68; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,360))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x6a; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,362))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,640))) 
(%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x7a; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,378))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x7c; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,380))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,672))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x8c; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,396))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x8e; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,398))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,704))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x9e; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,414))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,416))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* 
VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,736))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xb0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,432))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xb2; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,434))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,768))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xc2; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,450))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xc4; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,452))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,800))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xd4; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,468))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xd6; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,470))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) 
(Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,832))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xe6; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,486))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xe8; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,488))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,864))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xf8; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,504))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xfa; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,506))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,896))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x0a; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% 
(rsi,522))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x0c; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,524))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xa0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,928))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x1c; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,540))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x1e; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,542))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,960))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x2e; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,558))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x30; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,560))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% 
ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,992))) (%_% ymm0) *) + 0xc3 (* RET *) +];; +(*** BYTECODE END ***) + (* Constant mldsa_polyz_unpack_17_tmc, defined from mldsa_polyz_unpack_17_mc by define_trimmed (what exactly is trimmed is not visible here); the core and NOIBT theorems later in this file are stated over it. *) +let mldsa_polyz_unpack_17_tmc = + define_trimmed "mldsa_polyz_unpack_17_tmc" mldsa_polyz_unpack_17_mc;; + (* Execution rule for the trimmed code; passed to X86_STEPS_TAC, mk_safety_spec and PROVE_SAFETY_SPEC_TAC later in this file. *) +let MLDSA_POLYZ_UNPACK_17_EXEC = X86_MK_CORE_EXEC_RULE mldsa_polyz_unpack_17_tmc;; + +(* ------------------------------------------------------------------------- *) +(* D=18 instantiations: 32 chunks of 8 coefficients (144-bit words), *) +(* one chunk per AVX2 block. *) +(* ------------------------------------------------------------------------- *) + +let NUM_OF_WORDLIST_SPLIT_18_256_8 = mk_split_theorem 18 256 8;; (* used in the core proof to expand the 256-coefficient input into 32 chunks *) +let WORD_SUBWORD_NUM_OF_WORDLIST_CASES_D18 = mk_subword_cases 18 8;; (* theorem list fed to ASM_SIMP_TAC in POLYZ17_STORE *) + +(* One 256-bit AVX2 store reassembles 8 zunpack17 coefficients into the *) +(* num_of_wordlist of the mapped 8-element sublist. *) +let POLYZ17_STORE = prove + (`!sl:(18 word) list. LENGTH sl = 8 ==> + ((word_join:int128->int128->int256) + ((word_join:int64->int64->int128) + ((word_join:int32->int32->int64) + (zunpack17 (word_subword (word (num_of_wordlist sl):144 word) (126,18))) + (zunpack17 (word_subword (word (num_of_wordlist sl):144 word) (108,18)))) + ((word_join:int32->int32->int64) + (zunpack17 (word_subword (word (num_of_wordlist sl):144 word) (90,18))) + (zunpack17 (word_subword (word (num_of_wordlist sl):144 word) (72,18))))) + ((word_join:int64->int64->int128) + ((word_join:int32->int32->int64) + (zunpack17 (word_subword (word (num_of_wordlist sl):144 word) (54,18))) + (zunpack17 (word_subword (word (num_of_wordlist sl):144 word) (36,18)))) + ((word_join:int32->int32->int64) + (zunpack17 (word_subword (word (num_of_wordlist sl):144 word) (18,18))) + (zunpack17 (word_subword (word (num_of_wordlist sl):144 word) (0,18)))))) + = word(num_of_wordlist (MAP zunpack17 sl))`, + GEN_TAC THEN DISCH_TAC THEN + ASM_SIMP_TAC WORD_SUBWORD_NUM_OF_WORDLIST_CASES_D18 THEN + POP_ASSUM MP_TAC THEN + REWRITE_TAC[num_CONV 
`8`; num_CONV `7`; num_CONV `6`; num_CONV `5`; + num_CONV `4`; num_CONV `3`; num_CONV `2`; num_CONV `1`; + LENGTH_EQ_CONS; LENGTH_EQ_NIL] THEN (* intent: turn the hypothesis LENGTH sl = 8 into an explicit 8-element list *) + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[MAP] THEN + REWRITE_TAC[EL; HD; TL; num_CONV `7`; num_CONV `6`; num_CONV `5`; + num_CONV `4`; num_CONV `3`; num_CONV `2`; num_CONV `1`] THEN + REWRITE_TAC[EL; HD; TL; num_of_wordlist] THEN + CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN + CONV_TAC WORD_BLAST);; + +(* Re-fold the two bytes128 pieces back into subwords of the 144-bit chunk. *) +let X86_BASE_SIMPS_D18 = [ (* first rule: t MOD 2 EXP 128 is bits 0..127 of the chunk, the piece loaded at byte offset 0 *) + prove(`word ((t:num) MOD 2 EXP 128) : 128 word = word_subword (word t : 144 word) (0,128)`, + REWRITE_TAC[GSYM VAL_EQ; VAL_WORD_SUBWORD; VAL_WORD; DIMINDEX_128] THEN + CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[DIV_1; MOD_MOD_REFL] THEN + REWRITE_TAC[ARITH_RULE `340282366920938463463374607431768211456 = 2 EXP 128`; + ARITH_RULE `22300745198530623141535718272648361505980416 = 2 EXP 144`] THEN + SIMP_TAC[MOD_MOD; LE_EXP; ARITH_EQ; ARITH_RULE `128 <= 144`] THEN + REWRITE_TAC[MOD_MOD_EXP_MIN] THEN CONV_TAC NUM_REDUCE_CONV); (* second rule: t DIV 2 EXP 16 is bits 16..143 of the chunk, the piece loaded at byte offset 2 *) + prove(`word ((t:num) DIV 2 EXP 16) : 128 word = word_subword (word t : 144 word) (16,128)`, + REWRITE_TAC[GSYM VAL_EQ; VAL_WORD_SUBWORD; VAL_WORD; DIMINDEX_128] THEN + CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[DIV_MOD; GSYM EXP_ADD] THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[ARITH_RULE `22300745198530623141535718272648361505980416 = 2 EXP 144`; + ARITH_RULE `1461501637330902918203684832716283019655932542976 = 2 EXP 160`] THEN + REWRITE_TAC[MOD_MOD_EXP_MIN] THEN CONV_TAC NUM_REDUCE_CONV)];; + +(* Split a 144-bit chunk read into the two bytes128 loads the asm performs *) +(* (at offsets 0 and 2 within each 18-byte block). 
*) +let READ_MEMORY_WBYTES_SPLIT_144_X86 = prove + (`t < 2 EXP 144 + ==> (read (memory :> wbytes a) (s:x86state) = (word t : 144 word) <=> + read (memory :> bytes128 a) s = (word (t MOD 2 EXP 128) : int128) /\ + read (memory :> bytes128 (word_add a (word 2))) s = + (word (t DIV 2 EXP 16) : int128))`, + DISCH_TAC THEN + REWRITE_TAC[BYTES128_WBYTES; GSYM VAL_EQ; VAL_READ_WBYTES; READ_COMPONENT_COMPOSE] THEN + CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[VAL_WORD] THEN CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN (* next three rewrites: cut the byte reads into pieces of 2, 14 and 2 bytes at a, a+2 and a+16 (abbreviated p0, p1, p2 further on) *) + REWRITE_TAC[CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV THENC DEPTH_CONV NUM_MULT_CONV) + (INST [`2`,`k:num`; `16`,`l:num`] READ_BYTES_SPLIT_ANY)] THEN + REWRITE_TAC[CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV THENC DEPTH_CONV NUM_MULT_CONV) + (INST [mk_comb(mk_comb(`word_add:int64->int64->int64`,`a:int64`),`word 2:int64`),`a:int64`; + `14`,`k:num`; `2`,`l:num`] READ_BYTES_SPLIT_ANY)] THEN + REWRITE_TAC[CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV THENC DEPTH_CONV NUM_MULT_CONV) + (INST [`2`,`k:num`; `14`,`l:num`] READ_BYTES_SPLIT_ANY)] THEN + REWRITE_TAC[WORD_ADD_ASSOC_CONSTS] THEN CONV_TAC(DEPTH_CONV NUM_ADD_CONV) THEN (* bring in the READ_BYTES_BOUND instance for each of the three pieces *) + MP_TAC(ISPECL [`a:int64`; `2`; `read memory (s:x86state)`] READ_BYTES_BOUND) THEN + MP_TAC(ISPECL [`word_add a (word 2):int64`; `14`; `read memory (s:x86state)`] READ_BYTES_BOUND) THEN + MP_TAC(ISPECL [`word_add a (word 16):int64`; `2`; `read memory (s:x86state)`] READ_BYTES_BOUND) THEN + CONV_TAC NUM_REDUCE_CONV THEN + ABBREV_TAC `p0 = read (bytes (a,2)) (read memory (s:x86state))` THEN + ABBREV_TAC `p1 = read (bytes (word_add a (word 2),14)) (read memory (s:x86state))` THEN + ABBREV_TAC `p2 = read (bytes (word_add a (word 16),2)) (read memory (s:x86state))` THEN + POP_ASSUM(K ALL_TAC) THEN POP_ASSUM(K ALL_TAC) THEN POP_ASSUM(K ALL_TAC) THEN (* discards the three most recent assumptions, presumably the abbreviation equations just introduced *) + REPEAT DISCH_TAC THEN + SUBGOAL_THEN `t MOD 22300745198530623141535718272648361505980416 = t` ASSUME_TAC THENL + [MATCH_MP_TAC MOD_LT THEN 
ASM_REWRITE_TAC[]; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN + ONCE_REWRITE_TAC[ARITH_RULE `340282366920938463463374607431768211456 = 2 EXP 128`; + ARITH_RULE `65536 = 2 EXP 16`; + ARITH_RULE `5192296858534827628530496329220096 = 2 EXP 112`] THEN + SIMP_TAC[MOD_MOD; LE_EXP; ARITH_EQ; ARITH_RULE `16 <= 128`] THEN + REWRITE_TAC[DIV_MOD; DIV_DIV; GSYM EXP_ADD; MOD_MOD_EXP_MIN] THEN + CONV_TAC NUM_REDUCE_CONV THEN (CONV_TAC TAUT ORELSE ASM_ARITH_TAC));; + +(* ------------------------------------------------------------------------- *) +(* zunpack17 lane folding for the VPSHUFB+VPSRLVD+VPAND+VPSUBD pipeline. *) +(* *) +(* After SIMD_SIMPLIFY each YMM0 lane is *) +(* word_sub (word 131072) (word_and (word_ushr sh) (word 262143)) *) +(* The masked, shifted byte-join selects an 18-bit field of the 128-bit *) +(* chunk half, so ZPRE17_LANE_CONV rewrites it to *) +(* word_zx (word_subword (off,18)) *) +(* via WORD_BLAST, and ZUNPACK17_CORRECT then folds the outer word_sub into *) +(* zunpack17, giving an atomic lane that VPSUBD/the store handle cheaply. 
*) +(* ------------------------------------------------------------------------- *) + (* ZPRE17_LANE_CONV tm: returns the theorem tm = word_zx (word_subword src (off,18)) for the first offset off in 0..130 at which WORD_BLAST proves it, where src is a subterm of tm that applies the constant word and has type 128 word or 144 word. Fails when no such subterm or offset exists. *) +let ZPRE17_LANE_CONV tm = + (* the lane's byte slices come from a single chunk word; find its width *) + let is_src t = try fst(dest_type(type_of t)) = "word" && is_comb t && + name_of(rator t) = "word" && + (let w = Num.int_of_num(dest_finty(hd(snd(dest_type(type_of t))))) in + w = 128 || w = 144) + with _ -> false in + let src = find_term is_src tm in + let srcw = Num.int_of_num(dest_finty(hd(snd(dest_type(type_of src))))) in + let srcty = mk_finty(Num.num_of_int srcw) in + tryfind (fun off -> + let goal = mk_eq(tm, mk_comb(`word_zx:18 word->int32`, + mk_comb(mk_comb(inst[srcty,`:N`] `word_subword:N word->num#num->18 word`, src), + mk_pair(mk_small_numeral off, `18`)))) in + WORD_BLAST goal) (0--130);; + (* Applies ZPRE17_LANE_CONV via DEPTH_CONV at subterms of the form word_and x (word 262143:int32); 262143 = 2 EXP 18 - 1, the 18-bit mask. Other subterms are rejected so DEPTH_CONV leaves them unchanged. *) +let ZPRE17_FOLD_CONV = + DEPTH_CONV (fun t -> + if is_comb t && is_comb(rator t) && + (try name_of(rator(rator t)) = "word_and" with _ -> false) && + (try rand t = `word 262143:int32` with _ -> false) + then ZPRE17_LANE_CONV t else failwith "ZPRE17_FOLD_CONV");; + +(* Fold the YMM0 register read assumption to a word_join of 8 atomic *) +(* zunpack17(word_subword ...) lanes. Targets only a YMM0 read carrying the *) +(* post-VPSUBD shape (word_sub (word 131072) ...), so it is a cheap no-op on *) +(* every other step. *) +let ZUNPACK17_FOLD_TAC (asl,w as gl) = + let is_target th = + let c = concl th in + can (term_match [] `read YMM0 s = x`) c && + can (find_term (fun t -> t = `word 131072:int32`)) c in + (TRY(FIRST_X_ASSUM(fun th -> + if not(is_target th) then failwith "" else + ASSUME_TAC(CONV_RULE(RAND_CONV + (ZPRE17_FOLD_CONV THENC REWRITE_CONV[ZUNPACK17_CORRECT])) th)))) gl;; + +(* ------------------------------------------------------------------------- *) +(* Core correctness theorem *) +(* ------------------------------------------------------------------------- *) + (* Stated over BUTLAST of the trimmed code, i.e. without its final byte: starting at pc with arguments r and b, where the 576 bytes at b hold num_of_wordlist l for 256 18-bit words, execution reaches pc + 1610 with num_of_wordlist (MAP zunpack17 l) in the 1024 bytes at r. *) +let MLDSA_POLYZ_UNPACK_17_CORRECT = prove + (`!r b (l:(18 word) list) pc. 
+ aligned 32 r /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,1611); (b,576)] + ==> ensures x86 + (\s. bytes_loaded s (word pc) (BUTLAST mldsa_polyz_unpack_17_tmc) /\ + read RIP s = word pc /\ + C_ARGUMENTS [r; b] s /\ + read(memory :> bytes(b,576)) s = num_of_wordlist l) + (\s. read RIP s = word(pc + 1610) /\ + read(memory :> bytes(r,1024)) s = num_of_wordlist (MAP zunpack17 l)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(r,1024)])`, + MAP_EVERY X_GEN_TAC [`r:int64`; `b:int64`; `l:(18 word) list`; `pc:num`] THEN + REWRITE_TAC[C_ARGUMENTS; MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; + NONOVERLAPPING_CLAUSES; ALL; fst MLDSA_POLYZ_UNPACK_17_EXEC] THEN + STRIP_TAC THEN + ENSURES_INIT_TAC "s0" THEN + + (*** Expand input: 256 coeffs -> 32 chunks of 144-bit words ***) + UNDISCH_TAC `read(memory :> bytes(b,576)) s0 = num_of_wordlist(l:(18 word) list)` THEN + IMP_REWRITE_TAC [NUM_OF_WORDLIST_SPLIT_18_256_8] THEN + CONV_TAC (ONCE_DEPTH_CONV LIST_OF_SEQ_CONV) THEN + REWRITE_TAC [MAP; o_DEF] THEN + CONV_TAC(LAND_CONV BYTES_EQ_NUM_OF_WORDLIST_EXPAND_CONV) THEN + STRIP_TAC THEN + + (*** Split each 144-bit chunk into the two bytes128 loads ***) + REPEAT(FIRST_X_ASSUM(fun th -> + if can (term_match [] + `read (memory :> wbytes a) s = word t`) (concl th) + then MP_TAC th else NO_TAC)) THEN + IMP_REWRITE_TAC [READ_MEMORY_WBYTES_SPLIT_144_X86] THEN + MAP_EVERY (fun n -> SUBGOAL_THEN (subst[mk_small_numeral n,`k:num`] + `num_of_wordlist (SUB_LIST (8 * k,8) (l : (18 word) list)) < 2 EXP 144`) + (fun th -> REWRITE_TAC[th]) THENL [ + TRANS_TAC LTE_TRANS (subst[mk_small_numeral n,`k:num`] + `2 EXP (dimindex(:18) * LENGTH(SUB_LIST(8*k,8) (l : (18 word) list)))`) THEN + REWRITE_TAC[NUM_OF_WORDLIST_BOUND] THEN + REWRITE_TAC[LENGTH_SUB_LIST; DIMINDEX_CONV `dimindex (:18)`] THEN + ASM_SIMP_TAC [] THEN NUM_REDUCE_TAC; + ALL_TAC]) (0--31) THEN + (*** Normalise the high-half load addresses from the nested form ***) + (*** word_add (word_add b 
(word 18k)) (word 2) ***) + (*** to the reduced form word_add b (word (18k+2)) the stepper computes, ***) + (*** so each block's high-half load resolves and YMM0 becomes ground before ***) + (*** the store (otherwise DISCARD_OLDSTATE_TAC silently drops the store). ***) + CONV_TAC (GEN_REWRITE_CONV TOP_DEPTH_CONV [WORD_ADD_ASSOC_CONSTS] THENC + TOP_SWEEP_CONV NUM_ADD_CONV) THEN + REPEAT STRIP_TAC THEN + + (*** Express each chunk's two bytes128 input pieces as subwords of the ***) + (*** single 144-bit chunk word, so the per-lane VPSUBD operands compose ***) + (*** into a single word_subword and SIMD_SIMPLIFY stays cheap. ***) + RULE_ASSUM_TAC(REWRITE_RULE X86_BASE_SIMPS_D18) THEN + + (*** Symbolic execution: simplify each block's lanes, then fold the just- ***) + (*** computed YMM0 into atomic zunpack17 lanes before it is stored so the ***) + (*** store and subsequent steps stay cheap. ***) + MAP_EVERY (fun n -> + X86_STEPS_TAC MLDSA_POLYZ_UNPACK_17_EXEC [n] THEN + SIMD_SIMPLIFY_TAC [] THEN + ZUNPACK17_FOLD_TAC) (1--276) THEN + + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + + (*** Fold each 256-bit store into 8 atomic zunpack17 lanes ***) + RULE_ASSUM_TAC(CONV_RULE(TRY_CONV(RAND_CONV + (ZPRE17_FOLD_CONV THENC REWRITE_CONV[ZUNPACK17_CORRECT])))) THEN + + (*** Establish the 32 sublist-length facts the CASES rewrites need ***) + MAP_EVERY (fun i -> SUBGOAL_THEN + (subst [mk_small_numeral (8 * i), `i:num`] + `LENGTH (SUB_LIST (i, 8) (l : (18 word) list)) = 8`) ASSUME_TAC + THENL [ASM_REWRITE_TAC [LENGTH_SUB_LIST] THEN NUM_REDUCE_TAC; ALL_TAC]) + (0 -- 31) THEN + + (*** Express the spec RHS as 32 chunks and split the 1024-byte output read ***) + (*** into 32 matching 256-bit conjuncts. 
***) + SUBGOAL_THEN `LENGTH(MAP zunpack17 (l:(18 word) list)) = 256` ASSUME_TAC THENL + [ASM_REWRITE_TAC[LENGTH_MAP]; ALL_TAC] THEN (* take the length fact out of the assumptions, use it to split the right-hand side, then put it back *) + FIRST_X_ASSUM(fun th -> if concl th = `LENGTH(MAP zunpack17 (l:(18 word) list)) = 256` + then GEN_REWRITE_TAC RAND_CONV [MATCH_MP NUM_OF_WORDLIST_SPLIT_32_256_8 th] THEN ASSUME_TAC th + else NO_TAC) THEN + CONV_TAC (ONCE_DEPTH_CONV LIST_OF_SEQ_CONV) THEN + REWRITE_TAC[MAP; o_DEF; GSYM MAP_SUB_LIST] THEN + CONV_TAC BYTES_EQ_NUM_OF_WORDLIST_EXPAND_CONV THEN + (*** Normalise 8*k indices to literals everywhere, convert the stores to ***) + (*** wbytes form, then rewrite each conjunct's spec RHS into the stored ***) + (*** word_join shape and discharge it against its store. ***) + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + RULE_ASSUM_TAC(CONV_RULE(ONCE_DEPTH_CONV NUM_MULT_CONV) o + REWRITE_RULE[BYTES256_WBYTES]) THEN + ASM_SIMP_TAC[GSYM POLYZ17_STORE]);; + +(* ------------------------------------------------------------------------- *) +(* Subroutine correctness *) +(* This must be kept in sync with the CBMC specification in *) +(* mldsa/src/native/x86_64/src/arith_native_x86_64.h *) +(* ------------------------------------------------------------------------- *) + (* Subroutine form over the trimmed code mldsa_polyz_unpack_17_tmc: adds the return through the stack (RIP = returnaddress, RSP advanced by 8) and a per-coefficient range bound on the output to what the core theorem states. *) +let MLDSA_POLYZ_UNPACK_17_NOIBT_SUBROUTINE_CORRECT = prove + (`!r b (l:(18 word) list) pc stackpointer returnaddress. + aligned 32 r /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_polyz_unpack_17_tmc)) + [(r,1024)] /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,LENGTH mldsa_polyz_unpack_17_tmc); (b,576)] /\ + nonoverlapping (stackpointer,8) (r,1024) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mldsa_polyz_unpack_17_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [r; b] s /\ + read(memory :> bytes(b,576)) s = num_of_wordlist l) + (\s. 
read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + read(memory :> bytes(r,1024)) s = num_of_wordlist (MAP zunpack17 l) /\ + (!i. i < 256 ==> + --(&(2 EXP 17) - &1) <= ival(EL i (MAP zunpack17 l)) /\ + ival(EL i (MAP zunpack17 l)) <= &(2 EXP 17))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(r,1024)])`, + X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_polyz_unpack_17_tmc + MLDSA_POLYZ_UNPACK_17_CORRECT THEN (* the range conjunct is not part of the core theorem; the steps below obtain it from ZUNPACK17_MAP_BOUND instantiated at l and i *) + REPEAT STRIP_TAC THEN + MP_TAC(ISPECL [`l:(18 word) list`; `i:num`] ZUNPACK17_MAP_BOUND) THEN + ASM_REWRITE_TAC[] THEN STRIP_TAC THEN ASM_REWRITE_TAC[]);; + (* Same statement with mldsa_polyz_unpack_17_mc in place of the trimmed code, derived from the NOIBT theorem with ADD_IBT_RULE. *) +let MLDSA_POLYZ_UNPACK_17_SUBROUTINE_CORRECT = prove + (`!r b (l:(18 word) list) pc stackpointer returnaddress. + aligned 32 r /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_polyz_unpack_17_mc)) + [(r,1024)] /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,LENGTH mldsa_polyz_unpack_17_mc); (b,576)] /\ + nonoverlapping (stackpointer,8) (r,1024) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mldsa_polyz_unpack_17_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [r; b] s /\ + read(memory :> bytes(b,576)) s = num_of_wordlist l) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + read(memory :> bytes(r,1024)) s = num_of_wordlist (MAP zunpack17 l) /\ + (!i. i < 256 ==> + --(&(2 EXP 17) - &1) <= ival(EL i (MAP zunpack17 l)) /\ + ival(EL i (MAP zunpack17 l)) <= &(2 EXP 17))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(r,1024)])`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLDSA_POLYZ_UNPACK_17_NOIBT_SUBROUTINE_CORRECT));; + +(* ------------------------------------------------------------------------- *) +(* Constant-time and memory safety proof. 
*) +(* ------------------------------------------------------------------------- *) + +needs "s2n_bignum/x86/proofs/consttime.ml";; +needs "mldsa_native/x86_64/proofs/subroutine_signatures.ml";; + +let full_spec,public_vars = mk_safety_spec + ~keep_maychanges:true + (assoc "mldsa_polyz_unpack_17_x86" subroutine_signatures) + (REWRITE_RULE[SOME_FLAGS] MLDSA_POLYZ_UNPACK_17_CORRECT) + MLDSA_POLYZ_UNPACK_17_EXEC;; + +let MLDSA_POLYZ_UNPACK_17_SAFE = + REWRITE_RULE [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; SOME_FLAGS] + (time prove + (full_spec, + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; SOME_FLAGS] THEN + PROVE_SAFETY_SPEC_TAC ~public_vars:public_vars MLDSA_POLYZ_UNPACK_17_EXEC));; + +let MLDSA_POLYZ_UNPACK_17_NOIBT_SUBROUTINE_SAFE = time prove + (`exists f_events. + forall e r b (l:(18 word) list) pc stackpointer returnaddress. + aligned 32 r /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_polyz_unpack_17_tmc)) + [(r,1024)] /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,LENGTH mldsa_polyz_unpack_17_tmc); (b,576)] /\ + nonoverlapping (stackpointer,8) (r,1024) + ==> ensures x86 + (\s. + bytes_loaded s (word pc) mldsa_polyz_unpack_17_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [r; b] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events b r pc stackpointer returnaddress /\ + memaccess_inbounds e2 [b,576; r,1024; stackpointer,8] + [r,1024; stackpointer,8])) + (\s s'. true)`, + X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_polyz_unpack_17_tmc + MLDSA_POLYZ_UNPACK_17_SAFE THEN + DISCHARGE_SAFETY_PROPERTY_TAC);; + +let MLDSA_POLYZ_UNPACK_17_SUBROUTINE_SAFE = time prove + (`exists f_events. + forall e r b (l:(18 word) list) pc stackpointer returnaddress. 
+ aligned 32 r /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_polyz_unpack_17_mc)) + [(r,1024)] /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,LENGTH mldsa_polyz_unpack_17_mc); (b,576)] /\ + nonoverlapping (stackpointer,8) (r,1024) + ==> ensures x86 + (\s. + bytes_loaded s (word pc) mldsa_polyz_unpack_17_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [r; b] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events b r pc stackpointer returnaddress /\ + memaccess_inbounds e2 [b,576; r,1024; stackpointer,8] + [r,1024; stackpointer,8])) + (\s s'. true)`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLDSA_POLYZ_UNPACK_17_NOIBT_SUBROUTINE_SAFE));; diff --git a/proofs/hol_light/x86_64/proofs/polyz_unpack_19_avx2_asm.ml b/proofs/hol_light/x86_64/proofs/polyz_unpack_19_avx2_asm.ml new file mode 100644 index 000000000..d24a80028 --- /dev/null +++ b/proofs/hol_light/x86_64/proofs/polyz_unpack_19_avx2_asm.ml @@ -0,0 +1,911 @@ +(* + * Copyright (c) The mldsa-native project authors + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 + *) + +(* ========================================================================= *) +(* Functional correctness of polyz_unpack_19 (x86_64 AVX2): *) +(* Unpack polynomial z with 20-bit packed coefficients (GAMMA1 = 2^19) *) +(* Maps packed [0, 2^20-1] to signed [-(2^19-1), 2^19] via GAMMA1 - x *) +(* (ML-DSA-65/87). *) +(* *) +(* The x86 routine builds the shuffle/shift/mask/gamma1 constants inline *) +(* (VMOVQ/VPINSRQ/VINSERTI128/VPBROADCASTD) and unpacks 8 coefficients per *) +(* block with VPSHUFB/VPSRLVD/VPAND/VPSUBD. 
*) +(* ========================================================================= *) + +needs "s2n_bignum/x86/proofs/base.ml";; +needs "mldsa_native/common/mldsa_specs.ml";; +needs "mldsa_native/x86_64/proofs/mldsa_utils.ml";; + +(**** print_literal_from_elf "x86_64/mldsa/polyz_unpack_19_avx2_asm.o";; + ****) + +let mldsa_polyz_unpack_19_mc = define_assert_from_elf + "mldsa_polyz_unpack_19_mc" "x86_64/mldsa/polyz_unpack_19_avx2_asm.o" +(*** BYTECODE START ***) +[ + 0xf3; 0x0f; 0x1e; 0xfa; (* ENDBR64 *) + 0x48; 0xb8; 0x00; 0x01; 0x02; 0xff; 0x02; 0x03; 0x04; 0xff; + (* MOV (% rax) (Imm64 (word 18375815690981605632)) *) + 0xc4; 0xe1; 0xf9; 0x6e; 0xc8; + (* VMOVQ (%_% xmm1) (% rax) *) + 0x48; 0xb8; 0x05; 0x06; 0x07; 0xff; 0x07; 0x08; 0x09; 0xff; + (* MOV (% rax) (Imm64 (word 18377228584898463237)) *) + 0xc4; 0xe3; 0xf1; 0x22; 0xc8; 0x01; + (* VPINSRQ (%_% xmm1) (%_% xmm1) (% rax) (Imm8 (word 1)) *) + 0x48; 0xb8; 0x16; 0x17; 0x18; 0xff; 0x18; 0x19; 0x1a; 0xff; + (* MOV (% rax) (Imm64 (word 18382032424215779094)) *) + 0xc4; 0xe1; 0xf9; 0x6e; 0xe8; + (* VMOVQ (%_% xmm5) (% rax) *) + 0x48; 0xb8; 0x1b; 0x1c; 0x1d; 0xff; 0x1d; 0x1e; 0x1f; 0xff; + (* MOV (% rax) (Imm64 (word 18383445318132636699)) *) + 0xc4; 0xe3; 0xd1; 0x22; 0xe8; 0x01; + (* VPINSRQ (%_% xmm5) (%_% xmm5) (% rax) (Imm8 (word 1)) *) + 0xc4; 0xe3; 0x75; 0x38; 0xcd; 0x01; + (* VINSERTI128 (%_% ymm1) (%_% ymm1) (%_% xmm5) (Imm8 (word 1)) *) + 0x48; 0xb8; 0x00; 0x00; 0x00; 0x00; 0x04; 0x00; 0x00; 0x00; + (* MOV (% rax) (Imm64 (word 17179869184)) *) + 0xc4; 0xe1; 0xf9; 0x6e; 0xd0; + (* VMOVQ (%_% xmm2) (% rax) *) + 0x48; 0xb8; 0x00; 0x00; 0x00; 0x00; 0x04; 0x00; 0x00; 0x00; + (* MOV (% rax) (Imm64 (word 17179869184)) *) + 0xc4; 0xe3; 0xe9; 0x22; 0xd0; 0x01; + (* VPINSRQ (%_% xmm2) (%_% xmm2) (% rax) (Imm8 (word 1)) *) + 0xc4; 0xe3; 0x6d; 0x38; 0xd2; 0x01; + (* VINSERTI128 (%_% ymm2) (%_% ymm2) (%_% xmm2) (Imm8 (word 1)) *) + 0xb8; 0xff; 0xff; 0x0f; 0x00; + (* MOV (% eax) (Imm32 (word 1048575)) *) + 0xc5; 0xf9; 
0x6e; 0xd8; (* VMOVD (%_% xmm3) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xdb; + (* VPBROADCASTD (%_% ymm3) (%_% xmm3) *) + 0xb8; 0x00; 0x00; 0x08; 0x00; + (* MOV (% eax) (Imm32 (word 524288)) *) + 0xc5; 0xf9; 0x6e; 0xe0; (* VMOVD (%_% xmm4) (% eax) *) + 0xc4; 0xe2; 0x7d; 0x58; 0xe4; + (* VPBROADCASTD (%_% ymm4) (%_% xmm4) *) + 0xc5; 0xfa; 0x6f; 0x06; (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,0))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x04; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,4))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x07; (* VMOVDQA (Memop Word256 (%% (rdi,0))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x14; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,20))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x18; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,24))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x47; 0x20; + (* VMOVDQA (Memop Word256 (%% (rdi,32))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x28; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,40))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x2c; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,44))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% 
ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x47; 0x40; + (* VMOVDQA (Memop Word256 (%% (rdi,64))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x3c; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,60))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x40; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,64))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x47; 0x60; + (* VMOVDQA (Memop Word256 (%% (rdi,96))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x50; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,80))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x54; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,84))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x80; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,128))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x64; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,100))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x68; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,104))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 
(word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,160))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x46; 0x78; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,120))) *) + 0xc5; 0xfa; 0x6f; 0x6e; 0x7c; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,124))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xc0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,192))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x8c; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,140))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x90; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,144))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,224))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xa0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,160))) *) + 0xc5; 0xfa; 0x6f; 0xae; 
0xa4; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,164))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x00; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,256))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xb4; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,180))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xb8; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,184))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x20; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,288))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xc8; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,200))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xcc; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,204))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x40; 
0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,320))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xdc; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,220))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xe0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,224))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x60; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,352))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xf0; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,240))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xf4; 0x00; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,244))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,384))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x04; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,260))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x08; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,264))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD 
(%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xa0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,416))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x18; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,280))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x1c; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,284))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xc0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,448))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x2c; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,300))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x30; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,304))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,480))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x40; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,320))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x44; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,324))) *) + 0xc4; 0xe3; 0x7d; 
0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x00; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,512))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x54; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,340))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x58; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,344))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x20; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,544))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x68; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,360))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x6c; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,364))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x40; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,576))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 
0x7c; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,380))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x80; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,384))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x60; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,608))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x90; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,400))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x94; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,404))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x80; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,640))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xa4; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,420))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xa8; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,424))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) 
*) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xa0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,672))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xb8; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,440))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xbc; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,444))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xc0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,704))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xcc; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,460))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xd0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,464))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xe0; 0x02; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,736))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xe0; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,480))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xe4; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,484))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 
0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x00; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,768))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0xf4; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,500))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0xf8; 0x01; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,504))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x20; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,800))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x08; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,520))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x0c; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,524))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x40; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,832))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x1c; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,540))) *) + 0xc5; 0xfa; 0x6f; 0xae; 
0x20; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,544))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x60; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,864))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x30; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,560))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x34; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,564))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0x80; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,896))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x44; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,580))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x48; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,584))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xa0; 
0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,928))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x58; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,600))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x5c; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,604))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xc0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,960))) (%_% ymm0) *) + 0xc5; 0xfa; 0x6f; 0x86; 0x6c; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm0) (Memop Word128 (%% (rsi,620))) *) + 0xc5; 0xfa; 0x6f; 0xae; 0x70; 0x02; 0x00; 0x00; + (* VMOVDQU (%_% xmm5) (Memop Word128 (%% (rsi,624))) *) + 0xc4; 0xe3; 0x7d; 0x38; 0xc5; 0x01; + (* VINSERTI128 (%_% ymm0) (%_% ymm0) (%_% xmm5) (Imm8 (word 1)) *) + 0xc4; 0xe2; 0x7d; 0x00; 0xc1; + (* VPSHUFB (%_% ymm0) (%_% ymm0) (%_% ymm1) *) + 0xc4; 0xe2; 0x7d; 0x45; 0xc2; + (* VPSRLVD (%_% ymm0) (%_% ymm0) (%_% ymm2) *) + 0xc5; 0xfd; 0xdb; 0xc3; (* VPAND (%_% ymm0) (%_% ymm0) (%_% ymm3) *) + 0xc5; 0xdd; 0xfa; 0xc0; (* VPSUBD (%_% ymm0) (%_% ymm4) (%_% ymm0) *) + 0xc5; 0xfd; 0x7f; 0x87; 0xe0; 0x03; 0x00; 0x00; + (* VMOVDQA (Memop Word256 (%% (rdi,992))) (%_% ymm0) *) + 0xc3 (* RET *) +];; +(*** BYTECODE END ***) + +let mldsa_polyz_unpack_19_tmc = + define_trimmed "mldsa_polyz_unpack_19_tmc" mldsa_polyz_unpack_19_mc;; + +let MLDSA_POLYZ_UNPACK_19_EXEC = X86_MK_CORE_EXEC_RULE mldsa_polyz_unpack_19_tmc;; + +(* ------------------------------------------------------------------------- *) +(* D=20 instantiations: 32 chunks of 8 coefficients (160-bit words), *) +(* one chunk per AVX2 block. 
*) +(* ------------------------------------------------------------------------- *) + +let NUM_OF_WORDLIST_SPLIT_20_256_8 = mk_split_theorem 20 256 8;; +let WORD_SUBWORD_NUM_OF_WORDLIST_CASES_D20 = mk_subword_cases 20 8;; + +(* One 256-bit AVX2 store reassembles 8 zunpack19 coefficients into the *) +(* num_of_wordlist of the mapped 8-element sublist. *) +let POLYZ19_STORE = prove + (`!sl:(20 word) list. LENGTH sl = 8 ==> + ((word_join:int128->int128->int256) + ((word_join:int64->int64->int128) + ((word_join:int32->int32->int64) + (zunpack19 (word_subword (word (num_of_wordlist sl):160 word) (140,20))) + (zunpack19 (word_subword (word (num_of_wordlist sl):160 word) (120,20)))) + ((word_join:int32->int32->int64) + (zunpack19 (word_subword (word (num_of_wordlist sl):160 word) (100,20))) + (zunpack19 (word_subword (word (num_of_wordlist sl):160 word) (80,20))))) + ((word_join:int64->int64->int128) + ((word_join:int32->int32->int64) + (zunpack19 (word_subword (word (num_of_wordlist sl):160 word) (60,20))) + (zunpack19 (word_subword (word (num_of_wordlist sl):160 word) (40,20)))) + ((word_join:int32->int32->int64) + (zunpack19 (word_subword (word (num_of_wordlist sl):160 word) (20,20))) + (zunpack19 (word_subword (word (num_of_wordlist sl):160 word) (0,20)))))) + = word(num_of_wordlist (MAP zunpack19 sl))`, + GEN_TAC THEN DISCH_TAC THEN + ASM_SIMP_TAC WORD_SUBWORD_NUM_OF_WORDLIST_CASES_D20 THEN + POP_ASSUM MP_TAC THEN + REWRITE_TAC[num_CONV `8`; num_CONV `7`; num_CONV `6`; num_CONV `5`; + num_CONV `4`; num_CONV `3`; num_CONV `2`; num_CONV `1`; + LENGTH_EQ_CONS; LENGTH_EQ_NIL] THEN + REPEAT STRIP_TAC THEN ASM_REWRITE_TAC[MAP] THEN + REWRITE_TAC[EL; HD; TL; num_CONV `7`; num_CONV `6`; num_CONV `5`; + num_CONV `4`; num_CONV `3`; num_CONV `2`; num_CONV `1`] THEN + REWRITE_TAC[EL; HD; TL; num_of_wordlist] THEN + CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN + CONV_TAC WORD_BLAST);; + +(* Re-fold the two bytes128 pieces back into subwords of the 160-bit 
chunk. *) +let X86_BASE_SIMPS_D20 = [ + prove(`word ((t:num) MOD 2 EXP 128) : 128 word = word_subword (word t : 160 word) (0,128)`, + REWRITE_TAC[GSYM VAL_EQ; VAL_WORD_SUBWORD; VAL_WORD; DIMINDEX_128] THEN + CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[DIV_1; MOD_MOD_REFL] THEN + REWRITE_TAC[ARITH_RULE `340282366920938463463374607431768211456 = 2 EXP 128`; + ARITH_RULE `1461501637330902918203684832716283019655932542976 = 2 EXP 160`] THEN + SIMP_TAC[MOD_MOD; LE_EXP; ARITH_EQ; ARITH_RULE `128 <= 160`] THEN + REWRITE_TAC[MOD_MOD_EXP_MIN] THEN CONV_TAC NUM_REDUCE_CONV); + prove(`word ((t:num) DIV 2 EXP 32) : 128 word = word_subword (word t : 160 word) (32,128)`, + REWRITE_TAC[GSYM VAL_EQ; VAL_WORD_SUBWORD; VAL_WORD; DIMINDEX_128] THEN + CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[DIV_MOD; GSYM EXP_ADD] THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[ARITH_RULE `1461501637330902918203684832716283019655932542976 = 2 EXP 160`; + ARITH_RULE `6277101735386680763835789423207666416102355444464034512896 = 2 EXP 192`] THEN + REWRITE_TAC[MOD_MOD_EXP_MIN] THEN CONV_TAC NUM_REDUCE_CONV)];; + +(* Split a 160-bit chunk read into the two bytes128 loads the asm performs *) +(* (at offsets 0 and 4 within each 20-byte block).
*) +let READ_MEMORY_WBYTES_SPLIT_160_X86 = prove + (`t < 2 EXP 160 + ==> (read (memory :> wbytes a) (s:x86state) = (word t : 160 word) <=> + read (memory :> bytes128 a) s = (word (t MOD 2 EXP 128) : int128) /\ + read (memory :> bytes128 (word_add a (word 4))) s = + (word (t DIV 2 EXP 32) : int128))`, + DISCH_TAC THEN + REWRITE_TAC[BYTES128_WBYTES; GSYM VAL_EQ; VAL_READ_WBYTES; READ_COMPONENT_COMPOSE] THEN + CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[VAL_WORD] THEN CONV_TAC(DEPTH_CONV DIMINDEX_CONV) THEN CONV_TAC NUM_REDUCE_CONV THEN + REWRITE_TAC[CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV THENC DEPTH_CONV NUM_MULT_CONV) + (INST [`4`,`k:num`; `16`,`l:num`] READ_BYTES_SPLIT_ANY)] THEN + REWRITE_TAC[CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV THENC DEPTH_CONV NUM_MULT_CONV) + (INST [mk_comb(mk_comb(`word_add:int64->int64->int64`,`a:int64`),`word 4:int64`),`a:int64`; + `12`,`k:num`; `4`,`l:num`] READ_BYTES_SPLIT_ANY)] THEN + REWRITE_TAC[CONV_RULE (ONCE_DEPTH_CONV NUM_ADD_CONV THENC DEPTH_CONV NUM_MULT_CONV) + (INST [`4`,`k:num`; `12`,`l:num`] READ_BYTES_SPLIT_ANY)] THEN + REWRITE_TAC[WORD_ADD_ASSOC_CONSTS] THEN CONV_TAC(DEPTH_CONV NUM_ADD_CONV) THEN + MP_TAC(ISPECL [`a:int64`; `4`; `read memory (s:x86state)`] READ_BYTES_BOUND) THEN + MP_TAC(ISPECL [`word_add a (word 4):int64`; `12`; `read memory (s:x86state)`] READ_BYTES_BOUND) THEN + MP_TAC(ISPECL [`word_add a (word 16):int64`; `4`; `read memory (s:x86state)`] READ_BYTES_BOUND) THEN + CONV_TAC NUM_REDUCE_CONV THEN + ABBREV_TAC `p0 = read (bytes (a,4)) (read memory (s:x86state))` THEN + ABBREV_TAC `p1 = read (bytes (word_add a (word 4),12)) (read memory (s:x86state))` THEN + ABBREV_TAC `p2 = read (bytes (word_add a (word 16),4)) (read memory (s:x86state))` THEN + POP_ASSUM(K ALL_TAC) THEN POP_ASSUM(K ALL_TAC) THEN POP_ASSUM(K ALL_TAC) THEN + REPEAT DISCH_TAC THEN + SUBGOAL_THEN `t MOD 1461501637330902918203684832716283019655932542976 = t` ASSUME_TAC THENL + [MATCH_MP_TAC MOD_LT THEN 
ASM_REWRITE_TAC[] THEN + REWRITE_TAC[ARITH_RULE `1461501637330902918203684832716283019655932542976 = 2 EXP 160`] THEN + ASM_REWRITE_TAC[]; ALL_TAC] THEN + ASM_REWRITE_TAC[] THEN + ONCE_REWRITE_TAC[ARITH_RULE `340282366920938463463374607431768211456 = 2 EXP 128`; + ARITH_RULE `4294967296 = 2 EXP 32`; + ARITH_RULE `79228162514264337593543950336 = 2 EXP 96`] THEN + SIMP_TAC[MOD_MOD; LE_EXP; ARITH_EQ; ARITH_RULE `32 <= 128`] THEN + REWRITE_TAC[DIV_MOD; DIV_DIV; GSYM EXP_ADD; MOD_MOD_EXP_MIN] THEN + CONV_TAC NUM_REDUCE_CONV THEN (CONV_TAC TAUT ORELSE ASM_ARITH_TAC));; + +(* ------------------------------------------------------------------------- *) +(* zunpack19 lane folding for the VPSHUFB+VPSRLVD+VPAND+VPSUBD pipeline. *) +(* *) +(* After SIMD_SIMPLIFY each YMM0 lane is *) +(* word_sub (word 524288) (word_and (word_ushr sh) (word 1048575)) *) +(* The masked, shifted byte-join selects a 20-bit field of the 128-bit *) +(* chunk half, so ZPRE19_LANE_CONV rewrites it to *) +(* word_zx (word_subword (off,20)) *) +(* via WORD_BLAST, and ZUNPACK19_CORRECT then folds the outer word_sub into *) +(* zunpack19, giving an atomic lane that VPSUBD/the store handle cheaply. 
*) +(* ------------------------------------------------------------------------- *) + +let ZPRE19_LANE_CONV tm = + (* the lane's byte slices come from a single chunk word; find its width *) + let is_src t = try fst(dest_type(type_of t)) = "word" && is_comb t && + name_of(rator t) = "word" && + (let w = Num.int_of_num(dest_finty(hd(snd(dest_type(type_of t))))) in + w = 128 || w = 160) + with _ -> false in + let src = find_term is_src tm in + let srcw = Num.int_of_num(dest_finty(hd(snd(dest_type(type_of src))))) in + let srcty = mk_finty(Num.num_of_int srcw) in + tryfind (fun off -> + let goal = mk_eq(tm, mk_comb(`word_zx:20 word->int32`, + mk_comb(mk_comb(inst[srcty,`:N`] `word_subword:N word->num#num->20 word`, src), + mk_pair(mk_small_numeral off, `20`)))) in + WORD_BLAST goal) (0--150);; + +let ZPRE19_FOLD_CONV = + DEPTH_CONV (fun t -> + if is_comb t && is_comb(rator t) && + (try name_of(rator(rator t)) = "word_and" with _ -> false) && + (try rand t = `word 1048575:int32` with _ -> false) + then ZPRE19_LANE_CONV t else failwith "ZPRE19_FOLD_CONV");; + +(* Fold the YMM0 register read assumption to a word_join of 8 atomic *) +(* zunpack19(word_subword ...) lanes. Targets only a YMM0 read carrying the *) +(* post-VPSUBD shape (word_sub (word 524288) ...), so it is a cheap no-op on *) +(* every other step. *) +let ZUNPACK19_FOLD_TAC (asl,w as gl) = + let is_target th = + let c = concl th in + can (term_match [] `read YMM0 s = x`) c && + can (find_term (fun t -> t = `word 524288:int32`)) c in + (TRY(FIRST_X_ASSUM(fun th -> + if not(is_target th) then failwith "" else + ASSUME_TAC(CONV_RULE(RAND_CONV + (ZPRE19_FOLD_CONV THENC REWRITE_CONV[ZUNPACK19_CORRECT])) th)))) gl;; + +(* ------------------------------------------------------------------------- *) +(* Core correctness theorem *) +(* ------------------------------------------------------------------------- *) + +let MLDSA_POLYZ_UNPACK_19_CORRECT = prove + (`!r b (l:(20 word) list) pc. 
+ aligned 32 r /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,1614); (b,640)] + ==> ensures x86 + (\s. bytes_loaded s (word pc) (BUTLAST mldsa_polyz_unpack_19_tmc) /\ + read RIP s = word pc /\ + C_ARGUMENTS [r; b] s /\ + read(memory :> bytes(b,640)) s = num_of_wordlist l) + (\s. read RIP s = word(pc + 1613) /\ + read(memory :> bytes(r,1024)) s = num_of_wordlist (MAP zunpack19 l)) + (MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(r,1024)])`, + MAP_EVERY X_GEN_TAC [`r:int64`; `b:int64`; `l:(20 word) list`; `pc:num`] THEN + REWRITE_TAC[C_ARGUMENTS; MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; + NONOVERLAPPING_CLAUSES; ALL; fst MLDSA_POLYZ_UNPACK_19_EXEC] THEN + STRIP_TAC THEN + ENSURES_INIT_TAC "s0" THEN + + (*** Expand input: 256 coeffs -> 32 chunks of 160-bit words ***) + UNDISCH_TAC `read(memory :> bytes(b,640)) s0 = num_of_wordlist(l:(20 word) list)` THEN + IMP_REWRITE_TAC [NUM_OF_WORDLIST_SPLIT_20_256_8] THEN + CONV_TAC (ONCE_DEPTH_CONV LIST_OF_SEQ_CONV) THEN + REWRITE_TAC [MAP; o_DEF] THEN + CONV_TAC(LAND_CONV BYTES_EQ_NUM_OF_WORDLIST_EXPAND_CONV) THEN + STRIP_TAC THEN + + (*** Split each 160-bit chunk into the two bytes128 loads ***) + REPEAT(FIRST_X_ASSUM(fun th -> + if can (term_match [] + `read (memory :> wbytes a) s = word t`) (concl th) + then MP_TAC th else NO_TAC)) THEN + IMP_REWRITE_TAC [READ_MEMORY_WBYTES_SPLIT_160_X86] THEN + MAP_EVERY (fun n -> SUBGOAL_THEN (subst[mk_small_numeral n,`k:num`] + `num_of_wordlist (SUB_LIST (8 * k,8) (l : (20 word) list)) < 2 EXP 160`) + (fun th -> REWRITE_TAC[th]) THENL [ + TRANS_TAC LTE_TRANS (subst[mk_small_numeral n,`k:num`] + `2 EXP (dimindex(:20) * LENGTH(SUB_LIST(8*k,8) (l : (20 word) list)))`) THEN + REWRITE_TAC[NUM_OF_WORDLIST_BOUND] THEN + REWRITE_TAC[LENGTH_SUB_LIST; DIMINDEX_CONV `dimindex (:20)`] THEN + ASM_SIMP_TAC [] THEN NUM_REDUCE_TAC; + ALL_TAC]) (0--31) THEN + (*** Normalise the high-half load addresses from the nested form ***) + (*** word_add (word_add b 
(word 20k)) (word 4) ***) + (*** to the reduced form word_add b (word (20k+4)) the stepper computes, ***) + (*** so each block's high-half load resolves and YMM0 becomes ground before ***) + (*** the store (otherwise DISCARD_OLDSTATE_TAC silently drops the store). ***) + CONV_TAC (GEN_REWRITE_CONV TOP_DEPTH_CONV [WORD_ADD_ASSOC_CONSTS] THENC + TOP_SWEEP_CONV NUM_ADD_CONV) THEN + REPEAT STRIP_TAC THEN + + (*** Express each chunk's two bytes128 input pieces as subwords of the ***) + (*** single 160-bit chunk word, so the per-lane VPSUBD operands compose ***) + (*** into a single word_subword and SIMD_SIMPLIFY stays cheap. ***) + RULE_ASSUM_TAC(REWRITE_RULE X86_BASE_SIMPS_D20) THEN + + (*** Symbolic execution: simplify each block's lanes, then fold the just- ***) + (*** computed YMM0 into atomic zunpack19 lanes before it is stored so the ***) + (*** store and subsequent steps stay cheap. ***) + MAP_EVERY (fun n -> + X86_STEPS_TAC MLDSA_POLYZ_UNPACK_19_EXEC [n] THEN + SIMD_SIMPLIFY_TAC [] THEN + ZUNPACK19_FOLD_TAC) (1--276) THEN + + ENSURES_FINAL_STATE_TAC THEN ASM_REWRITE_TAC[] THEN + + (*** Fold each 256-bit store into 8 atomic zunpack19 lanes ***) + RULE_ASSUM_TAC(CONV_RULE(TRY_CONV(RAND_CONV + (ZPRE19_FOLD_CONV THENC REWRITE_CONV[ZUNPACK19_CORRECT])))) THEN + + (*** Establish the 32 sublist-length facts the CASES rewrites need ***) + MAP_EVERY (fun i -> SUBGOAL_THEN + (subst [mk_small_numeral (8 * i), `i:num`] + `LENGTH (SUB_LIST (i, 8) (l : (20 word) list)) = 8`) ASSUME_TAC + THENL [ASM_REWRITE_TAC [LENGTH_SUB_LIST] THEN NUM_REDUCE_TAC; ALL_TAC]) + (0 -- 31) THEN + + (*** Express the spec RHS as 32 chunks and split the 1024-byte output read ***) + (*** into 32 matching 256-bit conjuncts. 
***) + SUBGOAL_THEN `LENGTH(MAP zunpack19 (l:(20 word) list)) = 256` ASSUME_TAC THENL + [ASM_REWRITE_TAC[LENGTH_MAP]; ALL_TAC] THEN + FIRST_X_ASSUM(fun th -> if concl th = `LENGTH(MAP zunpack19 (l:(20 word) list)) = 256` + then GEN_REWRITE_TAC RAND_CONV [MATCH_MP NUM_OF_WORDLIST_SPLIT_32_256_8 th] THEN ASSUME_TAC th + else NO_TAC) THEN + CONV_TAC (ONCE_DEPTH_CONV LIST_OF_SEQ_CONV) THEN + REWRITE_TAC[MAP; o_DEF; GSYM MAP_SUB_LIST] THEN + CONV_TAC BYTES_EQ_NUM_OF_WORDLIST_EXPAND_CONV THEN + (*** Normalise 8*k indices to literals everywhere, convert the stores to ***) + (*** wbytes form, then rewrite each conjunct's spec RHS into the stored ***) + (*** word_join shape and discharge it against its store. ***) + CONV_TAC(ONCE_DEPTH_CONV NUM_MULT_CONV) THEN + RULE_ASSUM_TAC(CONV_RULE(ONCE_DEPTH_CONV NUM_MULT_CONV) o + REWRITE_RULE[BYTES256_WBYTES]) THEN + ASM_SIMP_TAC[GSYM POLYZ19_STORE]);; + +(* ------------------------------------------------------------------------- *) +(* Subroutine correctness *) +(* This must be kept in sync with the CBMC specification in *) +(* mldsa/src/native/x86_64/src/arith_native_x86_64.h *) +(* ------------------------------------------------------------------------- *) + +let MLDSA_POLYZ_UNPACK_19_NOIBT_SUBROUTINE_CORRECT = prove + (`!r b (l:(20 word) list) pc stackpointer returnaddress. + aligned 32 r /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_polyz_unpack_19_tmc)) + [(r,1024)] /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,LENGTH mldsa_polyz_unpack_19_tmc); (b,640)] /\ + nonoverlapping (stackpointer,8) (r,1024) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mldsa_polyz_unpack_19_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [r; b] s /\ + read(memory :> bytes(b,640)) s = num_of_wordlist l) + (\s. 
read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + read(memory :> bytes(r,1024)) s = num_of_wordlist (MAP zunpack19 l) /\ + (!i. i < 256 ==> + --(&(2 EXP 19) - &1) <= ival(EL i (MAP zunpack19 l)) /\ + ival(EL i (MAP zunpack19 l)) <= &(2 EXP 19))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(r,1024)])`, + X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_polyz_unpack_19_tmc + MLDSA_POLYZ_UNPACK_19_CORRECT THEN + REPEAT STRIP_TAC THEN + MP_TAC(ISPECL [`l:(20 word) list`; `i:num`] ZUNPACK19_MAP_BOUND) THEN + ASM_REWRITE_TAC[] THEN STRIP_TAC THEN ASM_REWRITE_TAC[]);; + +let MLDSA_POLYZ_UNPACK_19_SUBROUTINE_CORRECT = prove + (`!r b (l:(20 word) list) pc stackpointer returnaddress. + aligned 32 r /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_polyz_unpack_19_mc)) + [(r,1024)] /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,LENGTH mldsa_polyz_unpack_19_mc); (b,640)] /\ + nonoverlapping (stackpointer,8) (r,1024) + ==> ensures x86 + (\s. bytes_loaded s (word pc) mldsa_polyz_unpack_19_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [r; b] s /\ + read(memory :> bytes(b,640)) s = num_of_wordlist l) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + read(memory :> bytes(r,1024)) s = num_of_wordlist (MAP zunpack19 l) /\ + (!i. i < 256 ==> + --(&(2 EXP 19) - &1) <= ival(EL i (MAP zunpack19 l)) /\ + ival(EL i (MAP zunpack19 l)) <= &(2 EXP 19))) + (MAYCHANGE [RSP] ,, MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI ,, + MAYCHANGE [memory :> bytes(r,1024)])`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLDSA_POLYZ_UNPACK_19_NOIBT_SUBROUTINE_CORRECT));; + +(* ------------------------------------------------------------------------- *) +(* Constant-time and memory safety proof. 
*) +(* ------------------------------------------------------------------------- *) + +needs "s2n_bignum/x86/proofs/consttime.ml";; +needs "mldsa_native/x86_64/proofs/subroutine_signatures.ml";; + +let full_spec,public_vars = mk_safety_spec + ~keep_maychanges:true + (assoc "mldsa_polyz_unpack_19_x86" subroutine_signatures) + (REWRITE_RULE[SOME_FLAGS] MLDSA_POLYZ_UNPACK_19_CORRECT) + MLDSA_POLYZ_UNPACK_19_EXEC;; + +let MLDSA_POLYZ_UNPACK_19_SAFE = + REWRITE_RULE [MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; SOME_FLAGS] + (time prove + (full_spec, + REWRITE_TAC[MAYCHANGE_REGS_AND_FLAGS_PERMITTED_BY_ABI; SOME_FLAGS] THEN + PROVE_SAFETY_SPEC_TAC ~public_vars:public_vars MLDSA_POLYZ_UNPACK_19_EXEC));; + +let MLDSA_POLYZ_UNPACK_19_NOIBT_SUBROUTINE_SAFE = time prove + (`exists f_events. + forall e r b (l:(20 word) list) pc stackpointer returnaddress. + aligned 32 r /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_polyz_unpack_19_tmc)) + [(r,1024)] /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,LENGTH mldsa_polyz_unpack_19_tmc); (b,640)] /\ + nonoverlapping (stackpointer,8) (r,1024) + ==> ensures x86 + (\s. + bytes_loaded s (word pc) mldsa_polyz_unpack_19_tmc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [r; b] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events b r pc stackpointer returnaddress /\ + memaccess_inbounds e2 [b,640; r,1024; stackpointer,8] + [r,1024; stackpointer,8])) + (\s s'. true)`, + X86_PROMOTE_RETURN_NOSTACK_TAC mldsa_polyz_unpack_19_tmc + MLDSA_POLYZ_UNPACK_19_SAFE THEN + DISCHARGE_SAFETY_PROPERTY_TAC);; + +let MLDSA_POLYZ_UNPACK_19_SUBROUTINE_SAFE = time prove + (`exists f_events. + forall e r b (l:(20 word) list) pc stackpointer returnaddress. 
+ aligned 32 r /\ + ALL (nonoverlapping (word pc, LENGTH mldsa_polyz_unpack_19_mc)) + [(r,1024)] /\ + LENGTH l = 256 /\ + ALL (nonoverlapping (r,1024)) + [(word pc,LENGTH mldsa_polyz_unpack_19_mc); (b,640)] /\ + nonoverlapping (stackpointer,8) (r,1024) + ==> ensures x86 + (\s. + bytes_loaded s (word pc) mldsa_polyz_unpack_19_mc /\ + read RIP s = word pc /\ + read RSP s = stackpointer /\ + read (memory :> bytes64 stackpointer) s = returnaddress /\ + C_ARGUMENTS [r; b] s /\ + read events s = e) + (\s. read RIP s = returnaddress /\ + read RSP s = word_add stackpointer (word 8) /\ + (exists e2. + read events s = APPEND e2 e /\ + e2 = f_events b r pc stackpointer returnaddress /\ + memaccess_inbounds e2 [b,640; r,1024; stackpointer,8] + [r,1024; stackpointer,8])) + (\s s'. true)`, + MATCH_ACCEPT_TAC(ADD_IBT_RULE MLDSA_POLYZ_UNPACK_19_NOIBT_SUBROUTINE_SAFE));; diff --git a/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml b/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml index d9823c3b3..2695535a3 100644 --- a/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml +++ b/proofs/hol_light/x86_64/proofs/subroutine_signatures.ml @@ -10,6 +10,36 @@ (* ========================================================================= *) let subroutine_signatures = [ +("mldsa_polyz_unpack_17_x86", + ([(*args*) + ("r", "int32_t[static 256]", (*is const?*)"false"); + ("a", "uint8_t[static 576]", (*is const?*)"true"); + ], + "void", + [(* input buffers *) + ("a", "576"(* num elems *), 1(* elem bytesize *)); + ], + [(* output buffers *) + ("r", "256"(* num elems *), 4(* elem bytesize *)); + ], + [(* temporary buffers *) + ]) +); +("mldsa_polyz_unpack_19_x86", + ([(*args*) + ("r", "int32_t[static 256]", (*is const?*)"false"); + ("a", "uint8_t[static 640]", (*is const?*)"true"); + ], + "void", + [(* input buffers *) + ("a", "640"(* num elems *), 1(* elem bytesize *)); + ], + [(* output buffers *) + ("r", "256"(* num elems *), 4(* elem bytesize *)); + ], + [(* temporary buffers 
*) + ]) +); ("mldsa_poly_caddq_x86", ([(*args*) ("a", "int32_t[static 256]", (*is const?*)"false"); diff --git a/scripts/autogen b/scripts/autogen index 9109ba569..3ce73146f 100755 --- a/scripts/autogen +++ b/scripts/autogen @@ -2844,6 +2844,18 @@ def hol_light_asm_joblist(): f"-Imldsa/src/native/x86_64/src -Imldsa/src/common.h {x86_64_flags}", "x86_64", ), + ( + "polyz_unpack_17_avx2_asm.S", + "dev/x86_64/src", + f"-Imldsa/src/native/x86_64/src -Imldsa/src/common.h {x86_64_flags}", + "x86_64", + ), + ( + "polyz_unpack_19_avx2_asm.S", + "dev/x86_64/src", + f"-Imldsa/src/native/x86_64/src -Imldsa/src/common.h {x86_64_flags}", + "x86_64", + ), ( "pointwise_acc_l4_avx2_asm.S", "dev/x86_64/src",