Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .github/workflows/hol_light.yml
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,10 @@ jobs:
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"]
- name: poly_chknorm_avx2_asm
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"]
- name: polyz_unpack_17_avx2_asm
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"]
- name: polyz_unpack_19_avx2_asm
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "subroutine_signatures.ml"]
- name: ntt_avx2_asm
needs: ["mldsa_specs.ml", "mldsa_utils.ml", "mldsa_zetas.ml", "subroutine_signatures.ml"]
- name: intt_avx2_asm
Expand Down
10 changes: 6 additions & 4 deletions BIBLIOGRAPHY.md
Original file line number Diff line number Diff line change
Expand Up @@ -270,8 +270,8 @@ source code and documentation.
- [dev/x86_64/src/poly_decompose_88_avx2.c](dev/x86_64/src/poly_decompose_88_avx2.c)
- [dev/x86_64/src/poly_use_hint_32_avx2.c](dev/x86_64/src/poly_use_hint_32_avx2.c)
- [dev/x86_64/src/poly_use_hint_88_avx2.c](dev/x86_64/src/poly_use_hint_88_avx2.c)
- [dev/x86_64/src/polyz_unpack_17_avx2.c](dev/x86_64/src/polyz_unpack_17_avx2.c)
- [dev/x86_64/src/polyz_unpack_19_avx2.c](dev/x86_64/src/polyz_unpack_19_avx2.c)
- [dev/x86_64/src/polyz_unpack_17_avx2_asm.S](dev/x86_64/src/polyz_unpack_17_avx2_asm.S)
- [dev/x86_64/src/polyz_unpack_19_avx2_asm.S](dev/x86_64/src/polyz_unpack_19_avx2_asm.S)
- [dev/x86_64/src/rej_uniform_avx2.c](dev/x86_64/src/rej_uniform_avx2.c)
- [dev/x86_64/src/rej_uniform_eta2_avx2.c](dev/x86_64/src/rej_uniform_eta2_avx2.c)
- [dev/x86_64/src/rej_uniform_eta4_avx2.c](dev/x86_64/src/rej_uniform_eta4_avx2.c)
Expand All @@ -288,8 +288,8 @@ source code and documentation.
- [mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c](mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c)
- [mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c)
- [mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c](mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c)
- [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c)
- [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c)
- [mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S](mldsa/src/native/x86_64/src/polyz_unpack_17_avx2_asm.S)
- [mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S](mldsa/src/native/x86_64/src/polyz_unpack_19_avx2_asm.S)
- [mldsa/src/native/x86_64/src/rej_uniform_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_avx2.c)
- [mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c)
- [mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c](mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c)
Expand All @@ -302,6 +302,8 @@ source code and documentation.
- [proofs/hol_light/x86_64/mldsa/pointwise_avx2_asm.S](proofs/hol_light/x86_64/mldsa/pointwise_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/poly_caddq_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_caddq_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/poly_chknorm_avx2_asm.S](proofs/hol_light/x86_64/mldsa/poly_chknorm_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S](proofs/hol_light/x86_64/mldsa/polyz_unpack_17_avx2_asm.S)
- [proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S](proofs/hol_light/x86_64/mldsa/polyz_unpack_19_avx2_asm.S)

### `Round3_Spec`

Expand Down
4 changes: 2 additions & 2 deletions dev/x86_64/meta.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ static MLD_INLINE int mld_polyz_unpack_17_native(int32_t *r, const uint8_t *a)
{
return MLD_NATIVE_FUNC_FALLBACK;
}
mld_polyz_unpack_17_avx2(r, a);
mld_polyz_unpack_17_avx2_asm(r, a);
return MLD_NATIVE_FUNC_SUCCESS;
}
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44 \
Expand All @@ -250,7 +250,7 @@ static MLD_INLINE int mld_polyz_unpack_19_native(int32_t *r, const uint8_t *a)
{
return MLD_NATIVE_FUNC_FALLBACK;
}
mld_polyz_unpack_19_avx2(r, a);
mld_polyz_unpack_19_avx2_asm(r, a);
return MLD_NATIVE_FUNC_SUCCESS;
}
#endif /* MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
Expand Down
26 changes: 22 additions & 4 deletions dev/x86_64/src/arith_native_x86_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,11 +138,29 @@ __contract__(
);

#if !defined(MLD_CONFIG_NO_SIGN_API) || !defined(MLD_CONFIG_NO_VERIFY_API)
#define mld_polyz_unpack_17_avx2 MLD_NAMESPACE(mld_polyz_unpack_17_avx2)
void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a);
#define mld_polyz_unpack_17_avx2_asm MLD_NAMESPACE(polyz_unpack_17_avx2_asm)
MLD_SYSV_ABI
void mld_polyz_unpack_17_avx2_asm(int32_t *r, const uint8_t *a)
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/x86_64/proofs/polyz_unpack_17_avx2_asm.ml */
__contract__(
requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
requires(memory_no_alias(a, 576))
assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
ensures(array_bound(r, 0, MLDSA_N, -((1 << 17) - 1), (1 << 17) + 1))
);

#define mld_polyz_unpack_19_avx2 MLD_NAMESPACE(mld_polyz_unpack_19_avx2)
void mld_polyz_unpack_19_avx2(int32_t *r, const uint8_t *a);
#define mld_polyz_unpack_19_avx2_asm MLD_NAMESPACE(polyz_unpack_19_avx2_asm)
MLD_SYSV_ABI
void mld_polyz_unpack_19_avx2_asm(int32_t *r, const uint8_t *a)
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/x86_64/proofs/polyz_unpack_19_avx2_asm.ml */
__contract__(
requires(memory_no_alias(r, sizeof(int32_t) * MLDSA_N))
requires(memory_no_alias(a, 640))
assigns(memory_slice(r, sizeof(int32_t) * MLDSA_N))
ensures(array_bound(r, 0, MLDSA_N, -((1 << 19) - 1), (1 << 19) + 1))
);
#endif /* !MLD_CONFIG_NO_SIGN_API || !MLD_CONFIG_NO_VERIFY_API */

#define mld_pointwise_avx2_asm MLD_NAMESPACE(pointwise_avx2_asm)
Expand Down
94 changes: 0 additions & 94 deletions dev/x86_64/src/polyz_unpack_17_avx2.c

This file was deleted.

135 changes: 135 additions & 0 deletions dev/x86_64/src/polyz_unpack_17_avx2_asm.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* Copyright (c) The mldsa-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/

/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Dilithium optimized AVX2 implementation
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/dilithium/tree/master/avx2
*/

/*
* This file is derived from the public domain
* AVX2 Dilithium implementation @[REF_AVX2].
*/


/*************************************************
* Name: mld_polyz_unpack_17_avx2_asm
*
* Description: Unpack polynomial z with 18-bit packed coefficients
* (GAMMA1 = 2^17). Maps packed [0, 2^18-1] to signed
* [-(2^17-1), 2^17] via GAMMA1 - x.
*
* Arguments: - int32_t *r: pointer to output polynomial (1024 bytes)
* - const uint8_t *a: pointer to packed input (576 bytes)
**************************************************/

#include "../../../common.h"

#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
(!defined(MLD_CONFIG_NO_SIGN_API) || \
!defined(MLD_CONFIG_NO_VERIFY_API)) && \
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
MLD_CONFIG_PARAMETER_SET == 44)

/* simpasm: header-end */

/* Unpack one block of 8 coefficients (18 packed input bytes -> 8 x int32).
*
* Load bytes [18i .. 18i+15] into the low 128-bit half and [18i+2 .. 18i+17]
* into the high 128-bit half, then replicate the AVX2 intrinsic pipeline:
* byte shuffle, per-lane variable right shift, mask to 18 bits, and
* gamma1 - x. */
.macro unpack17_block i
vmovdqu (18*\i)(%rsi), %xmm0
vmovdqu (18*\i + 2)(%rsi), %xmm5
vinserti128 $1, %xmm5, %ymm0, %ymm0
vpshufb %ymm1, %ymm0, %ymm0
vpsrlvd %ymm2, %ymm0, %ymm0
vpand %ymm3, %ymm0, %ymm0
vpsubd %ymm0, %ymm4, %ymm0
vmovdqa %ymm0, (32*\i)(%rdi)
.endm

.text
.global MLD_ASM_NAMESPACE(polyz_unpack_17_avx2_asm)
.balign 16
MLD_ASM_FN_SYMBOL(polyz_unpack_17_avx2_asm)

/* Build the 256-bit byte-shuffle constant (shufbidx) in %ymm1.
* Low 128: {00,01,02,FF, 02,03,04,FF, 04,05,06,FF, 06,07,08,FF}
* High 128: {17,18,19,FF, 19,1A,1B,FF, 1B,1C,1D,FF, 1D,1E,1F,FF} */
movabs $0xFF040302FF020100, %rax
vmovq %rax, %xmm1
movabs $0xFF080706FF060504, %rax
vpinsrq $1, %rax, %xmm1, %xmm1
movabs $0xFF1B1A19FF191817, %rax
vmovq %rax, %xmm5
movabs $0xFF1F1E1DFF1D1C1B, %rax
vpinsrq $1, %rax, %xmm5, %xmm5
vinserti128 $1, %xmm5, %ymm1, %ymm1

/* Build the per-lane right-shift constant (srlvdidx) = (6,4,2,0,6,4,2,0)
* in %ymm2; the two 128-bit halves are identical. */
movabs $0x0000000200000000, %rax
vmovq %rax, %xmm2
movabs $0x0000000600000004, %rax
vpinsrq $1, %rax, %xmm2, %xmm2
vinserti128 $1, %xmm2, %ymm2, %ymm2

/* Build the 18-bit mask (0x3FFFF) broadcast into %ymm3. */
mov $0x3FFFF, %eax
vmovd %eax, %xmm3
vpbroadcastd %xmm3, %ymm3

/* Build gamma1 = 2^17 (0x20000) broadcast into %ymm4. */
mov $0x20000, %eax
vmovd %eax, %xmm4
vpbroadcastd %xmm4, %ymm4

unpack17_block 0
unpack17_block 1
unpack17_block 2
unpack17_block 3
unpack17_block 4
unpack17_block 5
unpack17_block 6
unpack17_block 7
unpack17_block 8
unpack17_block 9
unpack17_block 10
unpack17_block 11
unpack17_block 12
unpack17_block 13
unpack17_block 14
unpack17_block 15
unpack17_block 16
unpack17_block 17
unpack17_block 18
unpack17_block 19
unpack17_block 20
unpack17_block 21
unpack17_block 22
unpack17_block 23
unpack17_block 24
unpack17_block 25
unpack17_block 26
unpack17_block 27
unpack17_block 28
unpack17_block 29
unpack17_block 30
unpack17_block 31
ret

/* simpasm: footer-start */

#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \
!MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
*/
Loading
Loading