Add ML-DSA sign/verify

Reviewed-by: Viktor Dukhovni <viktor@openssl.org> Reviewed-by: Tim Hudson <tjh@openssl.org> Reviewed-by: Matt Caswell <matt@openssl.org> (Merged from https://github.com/openssl/openssl/pull/26127)
2025-03-31 20:10:45 +08:00 · 2024-12-20 14:18:27 +11:00 · 2024-12-20 14:18:27 +11:00 · 3ab7409f3d
commit 3ab7409f3d
parent efd7c96856
22 changed files with 2165 additions and 985 deletions
--- a/crypto/ml_dsa/build.info
+++ b/crypto/ml_dsa/build.info
@ -1,7 +1,8 @@
 LIBS=../../libcrypto

 $COMMON=ml_dsa_ctx.c ml_dsa_encoders.c ml_dsa_key_compress.c ml_dsa_key.c \
-        ml_dsa_matrix.c ml_dsa_ntt.c ml_dsa_params.c ml_dsa_sample.c
+        ml_dsa_matrix.c ml_dsa_ntt.c ml_dsa_params.c ml_dsa_sample.c \
+        ml_dsa_sign.c

 IF[{- !$disabled{'ml_dsa'} -}]
  SOURCE[../../libcrypto]=$COMMON
--- a/crypto/ml_dsa/ml_dsa_encoders.c
+++ b/crypto/ml_dsa/ml_dsa_encoders.c
@ -7,29 +7,104 @@
 * https://www.openssl.org/source/license.html
 */

+#include <assert.h>
 #include "ml_dsa_local.h"
 #include "ml_dsa_key.h"
 #include "ml_dsa_params.h"
+#include "ml_dsa_sign.h"
 #include "internal/packet.h"

-typedef int (PRIV_ENCODE_FN)(WPACKET *pkt, const POLY *s);
-typedef int (PRIV_DECODE_FN)(PACKET *pkt, POLY *s);
+typedef int (ENCODE_FN)(const POLY *s, WPACKET *pkt);
+typedef int (DECODE_FN)(POLY *s, PACKET *pkt);

-static PRIV_ENCODE_FN poly_encode_signed_2;
-static PRIV_ENCODE_FN poly_encode_signed_4;
-static PRIV_DECODE_FN poly_decode_signed_2;
-static PRIV_DECODE_FN poly_decode_signed_4;
-
-static ossl_inline int constant_time_declassify_int(int v)
-{
-    return value_barrier_32(v);
-}
+static ENCODE_FN poly_encode_signed_2;
+static ENCODE_FN poly_encode_signed_4;
+static ENCODE_FN poly_encode_signed_two_to_power_17;
+static ENCODE_FN poly_encode_signed_two_to_power_19;
+static DECODE_FN poly_decode_signed_2;
+static DECODE_FN poly_decode_signed_4;
+static DECODE_FN poly_decode_signed_two_to_power_17;
+static DECODE_FN poly_decode_signed_two_to_power_19;

 /* Bit packing Algorithms */

 /*
 * Encodes a polynomial into a byte string, assuming that all coefficients are
- * 10 bits.
+ * in the range 0..15 (4 bits).
+ *
+ * See FIPS 204, Algorithm 16, SimpleBitPack(w, b) where b = 4 bits
+ *
+ * i.e. Use 4 bits from each coefficient and pack them into bytes
+ * So every 2 coefficients fit into 1 byte.
+ *
+ * This is used to encode w1 when signing with ML-DSA-65 and ML-DSA-87
+ *
+ * @param p A polynomial with coefficients all in the range (0..15)
+ * @param pkt A packet object to write 128 bytes to.
+ *
+ * @returns 1 on success, or 0 on error.
+ */
+static int poly_encode_4_bits(const POLY *p, WPACKET *pkt)
+{
+    uint8_t *bytes;
+    size_t i;
+
+    if (!WPACKET_allocate_bytes(pkt, 32 * 4, &bytes))
+        return 0;
+
+    /*
+     * Each output byte carries an even-indexed coefficient in its low nibble
+     * and the following odd-indexed coefficient in its high nibble.
+     */
+    for (i = 0; i < ML_DSA_NUM_POLY_COEFFICIENTS; i += 2) {
+        const uint32_t lo = p->coeff[i];
+        const uint32_t hi = p->coeff[i + 1];
+
+        bytes[i / 2] = lo | (hi << 4);
+    }
+    return 1;
+}
+
+/*
+ * Encodes a polynomial into a byte string, assuming that all coefficients are
+ * in the range 0..43 (6 bits).
+ *
+ * See FIPS 204, Algorithm 16, SimpleBitPack(w, b) where b = 43
+ *
+ * i.e. Use 6 bits from each coefficient and pack them into bytes
+ * So every 4 coefficients fit into 3 bytes.
+ *
+ *  |c0||c1||c2||c3|
+ *   |  /|  /\  /
+ *  |6 2|4 4|2 6|
+ *
+ * This is used to encode w1 when signing with ML-DSA-44
+ *
+ * @param p A polynomial with coefficients all in the range (0..43)
+ * @param pkt A packet object to write 192 (6 * 256 / 8) bytes to.
+ *
+ * @returns 1 on success, or 0 on error.
+ */
+static int poly_encode_6_bits(const POLY *p, WPACKET *pkt)
+{
+    uint8_t *out;
+    const uint32_t *in = p->coeff, *end = in + ML_DSA_NUM_POLY_COEFFICIENTS;
+
+    if (!WPACKET_allocate_bytes(pkt, 32 * 6, &out))
+        return 0;
+
+    while (in < end) {
+        uint32_t c0 = *in++;
+        uint32_t c1 = *in++;
+        uint32_t c2 = *in++;
+        uint32_t c3 = *in++;
+
+        *out++ = c0 | (c1 << 6);        /* c0 bits 5..0, c1 bits 1..0 */
+        *out++ = (c1 >> 2) | (c2 << 4); /* c1 bits 5..2, c2 bits 3..0 */
+        *out++ = (c2 >> 4) | (c3 << 2); /* c2 bits 5..4, c3 bits 5..0 */
+    }
+    return 1;
+}
+
+/*
+ * Encodes a polynomial into a byte string, assuming that all coefficients are
+ * unsigned 10 bit values.
 *
 * See FIPS 204, Algorithm 16, SimpleBitPack(w, b) where b = 10 bits
 *
@ -46,12 +121,12 @@ static ossl_inline int constant_time_declassify_int(int v)
 *
 * @returns 1 on success, or 0 on error.
 */
-static int poly_encode_10_bits(WPACKET *pkt, const POLY *p)
+static int poly_encode_10_bits(const POLY *p, WPACKET *pkt)
 {
    uint8_t *out;
-    const uint32_t *in = p->coeff, *end = in + 256;
+    const uint32_t *in = p->coeff, *end = in + ML_DSA_NUM_POLY_COEFFICIENTS;

-    if (!WPACKET_allocate_bytes(pkt, 5 * (256 / 4), &out))
+    if (!WPACKET_allocate_bytes(pkt, 32 * 10, &out))
        return 0;

    while (in < end) {
@ -73,26 +148,28 @@ static int poly_encode_10_bits(WPACKET *pkt, const POLY *p)
 * @brief Reverses the procedure of poly_encode_10_bits().
 * See FIPS 204, Algorithm 18, SimpleBitUnpack(v, b) where b = 10.
 *
- * @param pkt A packet object to read 320 bytes from.
 * @param p A polynomial to write coefficients to.
+ * @param pkt A packet object to read 320 bytes from.
 *
 * @returns 1 on success, or 0 on error.
 */
-static int poly_decode_10_bits(PACKET *pkt, POLY *p)
+static int poly_decode_10_bits(POLY *p, PACKET *pkt)
 {
-    int i, ret = 0;
+    int ret = 0;
    const uint8_t *in = NULL;
-    uint32_t v, *out = p->coeff;
+    uint32_t v, mask = 0x3ff; /* 10 bits */
+    uint32_t *out = p->coeff, *end = out + ML_DSA_NUM_POLY_COEFFICIENTS;

-    for (i = 0; i < (ML_DSA_NUM_POLY_COEFFICIENTS / 4); i++) {
+    do {
        if (!PACKET_get_bytes(pkt, &in, 5))
            goto err;
-        memcpy(&v, in, sizeof(v));
-        *out++ = v & 0x3ff;
-        *out++ = (v >> 10) & 0x3ff;
-        *out++ = (v >> 20) & 0x3ff;
+        /* put first 4 bytes into v, 5th byte is accessed directly as in[4] */
+        memcpy(&v, in, 4);
+        *out++ = v & mask;
+        *out++ = (v >> 10) & mask;
+        *out++ = (v >> 20) & mask;
        *out++ = (v >> 30) | (((uint32_t)in[4]) << 2);
-    }
+    } while (out < end);
    ret = 1;
 err:
    return ret;
@ -109,15 +186,15 @@ err:
 * This is used to encode the private key polynomial elements of s1 and s2
 * for ML-DSA-65 (i.e. eta = 4)
 *
- * @param pkt A packet to write 128 bytes of encoded polynomial coefficients to.
 * @param p An array of 256 coefficients all in the range -4..4
+ * @param pkt A packet to write 128 bytes of encoded polynomial coefficients to.
 *
 * @returns 1 on success, or 0 on error.
 */
-static int poly_encode_signed_4(WPACKET *pkt, const POLY *p)
+static int poly_encode_signed_4(const POLY *p, WPACKET *pkt)
 {
    uint8_t *out;
-    const uint32_t *in = p->coeff, *end = in + 256;
+    const uint32_t *in = p->coeff, *end = in + ML_DSA_NUM_POLY_COEFFICIENTS;

    if (!WPACKET_allocate_bytes(pkt, 32 * 4, &out))
        return 0;
@ -135,23 +212,23 @@ static int poly_encode_signed_4(WPACKET *pkt, const POLY *p)
 * @brief Reverses the procedure of poly_encode_signed_4().
 * See FIPS 204, Algorithm 19, BitUnpack(v, a, b) where a = b = 4.
 *
- * @param pkt A packet object to read 128 bytes from.
 * @param p A polynomial to write coefficients to.
+ * @param pkt A packet object to read 128 bytes from.
 *
 * @returns 1 on success, or 0 on error. An error will occur if any of the
 *          coefficients are not in the correct range.
 */
-static int poly_decode_signed_4(PACKET *pkt, POLY *s)
+static int poly_decode_signed_4(POLY *p, PACKET *pkt)
 {
    int i, ret = 0;
-    uint32_t v, *out = s->coeff;
+    uint32_t v, *out = p->coeff;
    const uint8_t *in;
    uint32_t msbs, mask;

    for (i = 0; i < (ML_DSA_NUM_POLY_COEFFICIENTS / 8); i++) {
        if (!PACKET_get_bytes(pkt, &in, 4))
            goto err;
-        memcpy(&v, &in, 4);
+        memcpy(&v, in, 4);

        /*
         * None of the nibbles may be >= 9. So if the MSB of any nibble is set,
@ -164,7 +241,7 @@ static int poly_decode_signed_4(PACKET *pkt, POLY *s)
         * A nibble is only out of range in the case of invalid input, in which case
         * it is okay to leak the value.
         */
-        if (constant_time_declassify_int((mask & v) != 0))
+        if (value_barrier_32((mask & v) != 0))
            goto err;

        *out++ = mod_sub(4, v & 15);
@ -198,15 +275,15 @@ static int poly_decode_signed_4(PACKET *pkt, POLY *s)
 *   | /  / | |  / / | |  /
 *  |3 3 2| 1 3 3 1| 2 3 3|
 *
- * @param pkt A packet to write 64 bytes of encoded polynomial coefficients to.
 * @param p An array of 256 coefficients all in the range -2..2
+ * @param pkt A packet to write 96 bytes of encoded polynomial coefficients to.
 *
 * @returns 1 on success, or 0 on error.
 */
-static int poly_encode_signed_2(WPACKET *pkt, const POLY *s)
+static int poly_encode_signed_2(const POLY *p, WPACKET *pkt)
 {
    uint8_t *out;
-    const uint32_t *in = s->coeff, *end = in + 256;
+    const uint32_t *in = p->coeff, *end = in + ML_DSA_NUM_POLY_COEFFICIENTS;

    if (!WPACKET_allocate_bytes(pkt, 32 * 3, &out))
        return 0;
@ -232,13 +309,13 @@ static int poly_encode_signed_2(WPACKET *pkt, const POLY *s)
 * @brief Reverses the procedure of poly_encode_signed_2().
 * See FIPS 204, Algorithm 19, BitUnpack(v, a, b) where a = b = 2.
 *
- * @param pkt A packet object to read 64 encoded bytes from.
 * @param p A polynomial to write coefficients to.
+ * @param pkt A packet object to read 96 encoded bytes from.
 *
 * @returns 1 on success, or 0 on error. An error will occur if any of the
 *          coefficients are not in the correct range.
 */
-static int poly_decode_signed_2(PACKET *pkt, POLY *p)
+static int poly_decode_signed_2(POLY *p, PACKET *pkt)
 {
    int i, ret = 0;
    uint32_t v = 0, *out = p->coeff;
@ -248,7 +325,7 @@ static int poly_decode_signed_2(PACKET *pkt, POLY *p)
    for (i = 0; i < (ML_DSA_NUM_POLY_COEFFICIENTS / 8); i++) {
        if (!PACKET_get_bytes(pkt, &in, 3))
            goto err;
-        memcpy(&v, &in, 3);
+        memcpy(&v, in, 3);
        /*
         * Each octal value (3 bits) must be <= 4, So if the MSB is set then the
         * bottom 2 bits must not be set.
@ -261,7 +338,7 @@ static int poly_decode_signed_2(PACKET *pkt, POLY *p)
         * A nibble is only out of range in the case of invalid input, in which
         * case it is okay to leak the value.
         */
-        if (constant_time_declassify_int((mask & v) != 0))
+        if (value_barrier_32((mask & v) != 0))
            goto err;

        *out++ = mod_sub(2, v & 7);
@ -293,16 +370,16 @@ static int poly_decode_signed_2(PACKET *pkt, POLY *p)
 *  |   |  | |   / \  |  |  |  |
 * |13 13 13 13 12 |1 13 13 13 24
 *
+ * @param p An array of 256 coefficients all in the range -2^12+1..2^12
 * @param pkt A packet to write 416 (13 * 256 / 8) bytes of encoded polynomial
 *            coefficients to.
- * @param p An array of 256 coefficients all in the range -2^12+1..2^12
 *
 * @returns 1 on success, or 0 on error.
 */
-static int poly_encode_signed_two_to_power_12(WPACKET *pkt, const POLY *p)
+static int poly_encode_signed_two_to_power_12(const POLY *p, WPACKET *pkt)
 {
    static const uint32_t range = 1u << 12;
-    const uint32_t *in = p->coeff, *end = in + 256;
+    const uint32_t *in = p->coeff, *end = in + ML_DSA_NUM_POLY_COEFFICIENTS;

    while (in < end) {
        uint64_t z0 = mod_sub(range, *in++); /* < 2^13 */
@ -327,12 +404,12 @@ static int poly_encode_signed_two_to_power_12(WPACKET *pkt, const POLY *p)
 * @brief Reverses the procedure of poly_encode_signed_two_to_power_12().
 * See FIPS 204, Algorithm 19, BitUnpack(v, a, b) where a = 2^12 - 1, b = 2^12.
 *
- * @param pkt A packet object to read 416 encoded bytes from.
 * @param p A polynomial to write coefficients to.
+ * @param pkt A packet object to read 416 encoded bytes from.
 *
 * @returns 1 on success, or 0 on error.
 */
-static int poly_decode_signed_two_to_power_12(PACKET *pkt, POLY *p)
+static int poly_decode_signed_two_to_power_12(POLY *p, PACKET *pkt)
 {
    int i, ret = 0;
    uint64_t a1 = 0, a2 = 0;
@ -347,10 +424,6 @@ static int poly_decode_signed_two_to_power_12(PACKET *pkt, POLY *p)
        memcpy(&a1, in, 8);
        memcpy(&a2, in + 8, 5);

-        /*
-         * It is not possible for a 13-bit number to be out of range when the
-         * max is 2^12.
-         */
        *out++ = mod_sub(range, a1 & mask_13_bits);
        *out++ = mod_sub(range, (a1 >> 13) & mask_13_bits);
        *out++ = mod_sub(range, (a1 >> 26) & mask_13_bits);
@ -365,6 +438,165 @@ static int poly_decode_signed_two_to_power_12(PACKET *pkt, POLY *p)
    return ret;
 }

+/*
+ * @brief Encodes a polynomial into a byte string, assuming that all
+ * coefficients are in the range (-2^19 + 1)..2^19.
+ * See FIPS 204, Algorithm 17, BitPack(w, a, b). where a = 2^19 - 1, b = 2^19.
+ *
+ * This is used to encode signatures for ML-DSA-65 & ML-DSA-87 (gamma1 = 2^19)
+ *
+ * Use 20 bits from each coefficient and pack them into bytes
+ *
+ * The code below packs every 4 (20 bit) coefficients into 10 bytes
+ *  z0  z1  z2 z3
+ *  |   |\  |  | \
+ * |20 12|8 20 4|16
+ *
+ * Each group is emitted as two 32-bit words followed by the first 2 bytes of
+ * a third word, copied from memory as-is.
+ * NOTE(review): this presumably relies on a little-endian host so that the
+ * in-memory byte order matches the wire order - confirm.
+ *
+ * @param p An array of 256 coefficients all in the range -2^19+1..2^19
+ * @param pkt A packet to write 640 (20 * 256 / 8) bytes of encoded polynomial
+ *            coefficients to.
+ *
+ * @returns 1 on success, or 0 on error.
+ */
+static int poly_encode_signed_two_to_power_19(const POLY *p, WPACKET *pkt)
+{
+    static const uint32_t range = 1u << 19;
+    const uint32_t *in = p->coeff, *end = in + ML_DSA_NUM_POLY_COEFFICIENTS;
+
+    while (in < end) {
+        uint32_t z0 = mod_sub(range, *in++); /* < 2^20 */
+        uint32_t z1 = mod_sub(range, *in++);
+        uint32_t z2 = mod_sub(range, *in++);
+        uint32_t z3 = mod_sub(range, *in++);
+
+        z0 |= (z1 << 20); /* word 0: z0 (20 bits) + low 12 bits of z1 */
+        z1 >>= 12; /* keep the remaining high 8 bits of z1 */
+        z1 |= (z2 << 8) | (z3 << 28); /* word 1: z1 hi 8, z2, z3 low 4 */
+        z3 >>= 4; /* keep the remaining high 16 bits of z3 */
+
+        if (!WPACKET_memcpy(pkt, &z0, sizeof(z0))
+                || !WPACKET_memcpy(pkt, &z1, sizeof(z1))
+                || !WPACKET_memcpy(pkt, &z3, 2)) /* 4 + 4 + 2 = 10 bytes */
+            return 0;
+    }
+    return 1;
+}
+
+/*
+ * @brief Reverses the procedure of poly_encode_signed_two_to_power_19().
+ * See FIPS 204, Algorithm 19, BitUnpack(v, a, b) where a = 2^19 - 1, b = 2^19.
+ *
+ * Consumes 10 bytes per group of 4 coefficients (64 groups in total) and
+ * splits them back into four 20 bit values.
+ * NOTE(review): the bytes are loaded with memcpy() into uint32_t words, which
+ * presumably assumes a little-endian host - confirm.
+ *
+ * @param p A polynomial to write coefficients to.
+ * @param pkt A packet object to read 640 encoded bytes from.
+ *
+ * @returns 1 on success, or 0 on error.
+ */
+static int poly_decode_signed_two_to_power_19(POLY *p, PACKET *pkt)
+{
+    int i, ret = 0;
+    uint32_t a1, a2, a3 = 0; /* a3 pre-zeroed: only 2 of its bytes are loaded */
+    uint32_t *out = p->coeff;
+    const uint8_t *in;
+    static const uint32_t range = 1u << 19;
+    static const uint32_t mask_20_bits = (1u << 20) - 1;
+
+    for (i = 0; i < (ML_DSA_NUM_POLY_COEFFICIENTS / 4); i++) {
+        if (!PACKET_get_bytes(pkt, &in, 10)) /* fails if < 10 bytes remain */
+            goto err;
+        memcpy(&a1, in, 4);
+        memcpy(&a2, in + 4, 4);
+        memcpy(&a3, in + 8, 2);
+
+        *out++ = mod_sub(range, a1 & mask_20_bits); /* z0 */
+        *out++ = mod_sub(range, (a1 >> 20) | ((a2 & 0xFF) << 12)); /* z1 */
+        *out++ = mod_sub(range, (a2 >> 8) & mask_20_bits); /* z2 */
+        *out++ = mod_sub(range, (a2 >> 28) | (a3 << 4)); /* z3 */
+    }
+    ret = 1;
+ err:
+    return ret;
+}
+
+/*
+ * @brief Encodes a polynomial into a byte string, assuming that all
+ * coefficients are in the range (-2^17 + 1)..2^17.
+ * See FIPS 204, Algorithm 17, BitPack(w, a, b). where a = 2^17 - 1, b = 2^17.
+ *
+ * This is used to encode signatures for ML-DSA-44 (where gamma1 = 2^17)
+ *
+ * Use 18 bits from each coefficient and pack them into bytes
+ *
+ * The code below packs every 4 (18 bit) coefficients into 9 bytes
+ *  z0  z1  z2 z3
+ *  |   |\  |  | \
+ * |18 14|4 18 10| 8
+ *
+ * Each group is emitted as two 32-bit words followed by the first byte of a
+ * third word, copied from memory as-is.
+ * NOTE(review): this presumably relies on a little-endian host so that the
+ * in-memory byte order matches the wire order - confirm.
+ *
+ * @param p An array of 256 coefficients all in the range -2^17+1..2^17
+ * @param pkt A packet to write 576 (18 * 256 / 8) bytes of encoded polynomial
+ *            coefficients to.
+ *
+ * @returns 1 on success, or 0 on error.
+ */
+static int poly_encode_signed_two_to_power_17(const POLY *p, WPACKET *pkt)
+{
+    static const uint32_t range = 1u << 17;
+    const uint32_t *in = p->coeff, *end = in + ML_DSA_NUM_POLY_COEFFICIENTS;
+
+    while (in < end) {
+        uint32_t z0 = mod_sub(range, *in++); /* < 2^18 */
+        uint32_t z1 = mod_sub(range, *in++);
+        uint32_t z2 = mod_sub(range, *in++);
+        uint32_t z3 = mod_sub(range, *in++);
+
+        z0 |= (z1 << 18); /* word 0: z0 (18 bits) + low 14 bits of z1 */
+        z1 >>= 14; /* keep the remaining high 4 bits of z1 */
+        z1 |= (z2 << 4) | (z3 << 22); /* word 1: z1 hi 4, z2, z3 low 10 */
+        z3 >>= 10; /* keep the remaining high 8 bits of z3 */
+
+        if (!WPACKET_memcpy(pkt, &z0, sizeof(z0))
+                || !WPACKET_memcpy(pkt, &z1, sizeof(z1))
+                || !WPACKET_memcpy(pkt, &z3, 1)) /* 4 + 4 + 1 = 9 bytes */
+            return 0;
+    }
+    return 1;
+}
+
+/*
+ * @brief Reverses the procedure of poly_encode_signed_two_to_power_17().
+ * See FIPS 204, Algorithm 19, BitUnpack(v, a, b) where a = 2^17 - 1, b = 2^17.
+ *
+ * @param p A polynomial to write coefficients to.
+ * @param pkt A packet object to read 576 encoded bytes from.
+ *
+ * @returns 1 on success, or 0 on error.
+ */
+static int poly_decode_signed_two_to_power_17(POLY *p, PACKET *pkt)
+{
+    int ret = 0;
+    uint32_t a1, a2, a3 = 0; /* a3 pre-zeroed: only 1 of its bytes is loaded */
+    uint32_t *out = p->coeff;
+    const uint32_t *end = out + ML_DSA_NUM_POLY_COEFFICIENTS;
+    const uint8_t *in;
+    static const uint32_t range = 1u << 17;
+    static const uint32_t mask_18_bits = (1u << 18) - 1;
+
+    while (out < end) {
+        /*
+         * 4 coefficients * 18 bits = 72 bits = 9 bytes per group, matching
+         * what poly_encode_signed_two_to_power_17() writes.
+         */
+        if (!PACKET_get_bytes(pkt, &in, 9))
+            goto err;
+        memcpy(&a1, in, 4);
+        memcpy(&a2, in + 4, 4);
+        memcpy(&a3, in + 8, 1);
+
+        *out++ = mod_sub(range, a1 & mask_18_bits); /* z0 */
+        *out++ = mod_sub(range, (a1 >> 18) | ((a2 & 0xF) << 14)); /* z1 */
+        *out++ = mod_sub(range, (a2 >> 4) & mask_18_bits); /* z2 */
+        *out++ = mod_sub(range, (a2 >> 22) | (a3 << 10)); /* z3 */
+    }
+    ret = 1;
+ err:
+    return ret;
+}
+
 /*
 * @brief Encode the public key as an array of bytes.
 * See FIPS 204, Algorithm 22, pkEncode().
@ -390,7 +622,7 @@ int ossl_ml_dsa_pk_encode(ML_DSA_KEY *key)
            || !WPACKET_memcpy(&pkt, key->rho, sizeof(key->rho)))
        goto err;
    for (i = 0; i < t1_len; i++)
-        if (!poly_encode_10_bits(&pkt, t1 + i))
+        if (!poly_encode_10_bits(t1 + i, &pkt))
            goto err;
    OPENSSL_free(key->pub_encoding);
    key->pub_encoding = enc;
@ -412,7 +644,7 @@ err:
 *
 * @returns 1 if the public key was decoded successfully or 0 otherwise.
 */
-int ossl_ml_dsa_pk_decode(const uint8_t *in, size_t in_len, ML_DSA_KEY *key)
+int ossl_ml_dsa_pk_decode(ML_DSA_KEY *key, const uint8_t *in, size_t in_len)
 {
    int ret = 0;
    size_t i;
@ -425,7 +657,7 @@ int ossl_ml_dsa_pk_decode(const uint8_t *in, size_t in_len, ML_DSA_KEY *key)
            || PACKET_copy_bytes(&pkt, key->rho, sizeof(key->rho)))
        goto err;
    for (i = 0; i < key->t1.num_poly; i++)
-        if (!poly_decode_10_bits(&pkt, &key->t1.poly[i]))
+        if (!poly_decode_10_bits(key->t1.poly + i, &pkt))
            goto err;
    memcpy(key->pub_encoding, in, in_len);
    ret = 1;
@ -446,7 +678,7 @@ int ossl_ml_dsa_sk_encode(ML_DSA_KEY *key)
    int ret = 0;
    const ML_DSA_PARAMS *params = key->params;
    size_t i, k = params->k, l = params->l;
-    PRIV_ENCODE_FN *encode_fn;
+    ENCODE_FN *encode_fn;
    size_t enc_len = params->sk_len;
    const POLY *t0 = key->t0.poly;
    WPACKET pkt;
@ -456,7 +688,7 @@ int ossl_ml_dsa_sk_encode(ML_DSA_KEY *key)
        return 0;

    /* eta is the range of private key coefficients (-eta...eta) */
-    if (params->eta == 4)
+    if (params->eta == ML_DSA_ETA_4)
        encode_fn = poly_encode_signed_4;
    else
        encode_fn = poly_encode_signed_2;
@ -467,13 +699,13 @@ int ossl_ml_dsa_sk_encode(ML_DSA_KEY *key)
            || !WPACKET_memcpy(&pkt, key->tr, sizeof(key->tr)))
        goto err;
    for (i = 0; i < l; ++i)
-        if (!encode_fn(&pkt, &key->s1.poly[i]))
+        if (!encode_fn(key->s1.poly + i, &pkt))
            goto err;
    for (i = 0; i < k; ++i)
-        if (!encode_fn(&pkt, &key->s2.poly[i]))
+        if (!encode_fn(key->s2.poly + i, &pkt))
            goto err;
-    for (i = 0; i < k; ++i, t0++)
-        if (!poly_encode_signed_two_to_power_12(&pkt, t0))
+    for (i = 0; i < k; ++i)
+        if (!poly_encode_signed_two_to_power_12(t0++, &pkt))
            goto err;
    OPENSSL_clear_free(key->priv_encoding, enc_len);
    key->priv_encoding = enc;
@ -495,11 +727,11 @@ err:
 *
 * @returns 1 if the private key was decoded successfully or 0 otherwise.
 */
-int ossl_ml_dsa_sk_decode(const uint8_t *in, size_t in_len, ML_DSA_KEY *key)
+int ossl_ml_dsa_sk_decode(ML_DSA_KEY *key, const uint8_t *in, size_t in_len)
 {
    int ret = 0;
    uint8_t *enc = NULL;
-    PRIV_DECODE_FN *decode_fn;
+    DECODE_FN *decode_fn;
    const ML_DSA_PARAMS *params = key->params;
    size_t i, k = params->k, l = params->l;
    PACKET pkt;
@ -511,7 +743,7 @@ int ossl_ml_dsa_sk_decode(const uint8_t *in, size_t in_len, ML_DSA_KEY *key)
        return 0;

    /* eta is the range of private key coefficients (-eta...eta) */
-    if (params->eta == 4)
+    if (params->eta == ML_DSA_ETA_4)
        decode_fn = poly_decode_signed_4;
    else
        decode_fn = poly_decode_signed_2;
@ -523,13 +755,13 @@ int ossl_ml_dsa_sk_decode(const uint8_t *in, size_t in_len, ML_DSA_KEY *key)
        goto err;

    for (i = 0; i < l; ++i)
-        if (!decode_fn(&pkt, key->s1.poly + i))
+        if (!decode_fn(key->s1.poly + i, &pkt))
            goto err;
    for (i = 0; i < k; ++i)
-        if (!decode_fn(&pkt, key->s2.poly + i))
+        if (!decode_fn(key->s2.poly + i, &pkt))
            goto err;
    for (i = 0; i < k; ++i)
-        if (!poly_decode_signed_two_to_power_12(&pkt, key->t0.poly + i))
+        if (!poly_decode_signed_two_to_power_12(key->t0.poly + i, &pkt))
            goto err;
    if (PACKET_remaining(&pkt) != 0)
        goto err;
@ -539,3 +771,196 @@ int ossl_ml_dsa_sk_decode(const uint8_t *in, size_t in_len, ML_DSA_KEY *key)
 err:
    return ret;
 }
+
+/*
+ * See FIPS 204, Algorithm 20, HintBitPack().
+ * Hint is composed of k polynomials with binary coefficients where only 'omega'
+ * of all the coefficients are set to 1.
+ * This can be encoded as a byte array of 'omega' polynomial coefficient index
+ * positions for the coefficients that are set, followed by
+ * k values of the last coefficient index used in each polynomial.
+ *
+ * @param hint The vector of k hint polynomials to encode.
+ * @param pkt A packet to write (omega + k) bytes to.
+ * @param omega The maximum number of non-zero hint coefficients.
+ *
+ * @returns 1 on success, or 0 if the packet has insufficient space.
+ */
+static int hint_bits_encode(const VECTOR *hint, WPACKET *pkt, uint32_t omega)
+{
+    int i, j, k = hint->num_poly;
+    size_t coeff_index = 0;
+    POLY *p = hint->poly;
+    uint8_t *data;
+
+    if (!WPACKET_allocate_bytes(pkt, omega + k, &data))
+        return 0;
+    /*
+     * Index slots that are not used must be zero: the output buffer is not
+     * guaranteed to be pre-cleared, and hint_bits_decode() rejects any
+     * non-zero byte after the last used index.
+     */
+    memset(data, 0, omega + k);
+
+    for (i = 0; i < k; i++, p++) {
+        for (j = 0; j < ML_DSA_NUM_POLY_COEFFICIENTS; j++)
+            if (p->coeff[j] != 0) {
+                assert(coeff_index < omega);
+                data[coeff_index++] = j;
+            }
+        /* Running total of indexes used by polynomials 0..i */
+        data[omega + i] = (uint8_t)coeff_index;
+    }
+    return 1;
+}
+
+/*
+ * @brief Reverse the process of hint_bits_encode()
+ * See FIPS 204, Algorithm 21, HintBitUnpack()
+ *
+ * The input is 'omega' coefficient index bytes followed by one cumulative
+ * count byte per polynomial. Decoding fails if a count goes backwards or
+ * exceeds omega, if the indexes within one polynomial are not strictly
+ * increasing, or if any unused index byte is non-zero.
+ *
+ * @param hint The vector to decode into; it is zeroed before being filled.
+ * @param pkt A packet to read (omega + hint->num_poly) bytes from.
+ * @param omega The number of index bytes in the encoding.
+ *
+ * @returns 1 if the hints were successfully unpacked, or 0
+ * if 'pkt' is too small or malformed.
+ */
+static int hint_bits_decode(VECTOR *hint, PACKET *pkt, uint32_t omega)
+{
+    size_t coeff_index = 0, k = hint->num_poly;
+    const uint8_t *in, *limits;
+    POLY *p = hint->poly, *end = p + k;
+
+    if (!PACKET_get_bytes(pkt, &in, omega)
+            || !PACKET_get_bytes(pkt, &limits, k))
+        return 0;
+
+    vector_zero(hint); /* Set all coefficients to zero */
+
+    do {
+        const uint32_t limit = *limits++; /* cumulative count for this poly */
+        int last = -1; /* previous index seen in this poly, -1 if none yet */
+
+        if (limit < coeff_index || limit > omega)
+            return 0;
+
+        while (coeff_index < limit) {
+            int byte = in[coeff_index++];
+
+            if (last >= 0 && byte <= last) /* must be strictly increasing */
+                return 0;
+            last = byte;
+            p->coeff[byte] = 1;
+        }
+    } while (++p < end);
+
+    for (; coeff_index < omega; coeff_index++)
+        if (in[coeff_index] != 0) /* unused index slots must be zero */
+            return 0;
+    return 1;
+}
+
+/*
+ * @brief Encode a ML_DSA signature as an array of bytes.
+ * See FIPS 204, Algorithm 26, sigEncode().
+ *
+ * The output is c_tilde, then each polynomial of z, then the packed hint.
+ *
+ * @param sig The signature to encode (c_tilde, z and hint are read).
+ * @param params Algorithm constants; gamma1 selects the z encoder, sig_len
+ *               bounds the output and omega sizes the hint encoding.
+ * @param out The buffer to write up to params->sig_len bytes to. A NULL
+ *            value is rejected.
+ * @returns 1 if the signature was encoded successfully or 0 otherwise.
+ */
+int ossl_ml_dsa_sig_encode(const ML_DSA_SIG *sig, const ML_DSA_PARAMS *params,
+                           uint8_t *out)
+{
+    int ret = 0;
+    size_t i;
+    ENCODE_FN *encode_fn;
+    WPACKET pkt;
+
+    if (out == NULL)
+        return 0;
+
+    if (params->gamma1 == ML_DSA_GAMMA1_TWO_POWER_19)
+        encode_fn = poly_encode_signed_two_to_power_19;
+    else
+        encode_fn = poly_encode_signed_two_to_power_17;
+
+    if (!WPACKET_init_static_len(&pkt, out, params->sig_len, 0)
+            || !WPACKET_memcpy(&pkt, sig->c_tilde, sig->c_tilde_len))
+        goto err;
+
+    for (i = 0; i < sig->z.num_poly; ++i)
+        if (!encode_fn(sig->z.poly + i, &pkt))
+            goto err;
+    if (!hint_bits_encode(&sig->hint, &pkt, params->omega))
+        goto err;
+    ret = 1;
+err:
+    WPACKET_finish(&pkt); /* called on both the success and error paths */
+    return ret;
+}
+
+/*
+ * @brief Decode a byte array into a ML_DSA signature.
+ *
+ * Reads c_tilde, then each polynomial of z, then the packed hint, and
+ * requires that no input bytes are left over afterwards.
+ *
+ * @param sig is a initialized signature object to decode into.
+ * @param in An encoded signature
+ * @param in_len The size of |in|
+ * @param params contains constants for an ML-DSA algorithm (such as gamma1)
+ * @returns 1 if the signature was successfully decoded or 0 otherwise.
+ */
+int ossl_ml_dsa_sig_decode(ML_DSA_SIG *sig, const uint8_t *in, size_t in_len,
+                           const ML_DSA_PARAMS *params)
+{
+    PACKET pkt;
+    size_t idx;
+    DECODE_FN *decode_z = (params->gamma1 == ML_DSA_GAMMA1_TWO_POWER_19)
+        ? poly_decode_signed_two_to_power_19
+        : poly_decode_signed_two_to_power_17;
+
+    if (!PACKET_buf_init(&pkt, in, in_len))
+        return 0;
+    if (!PACKET_copy_bytes(&pkt, sig->c_tilde, sig->c_tilde_len))
+        return 0;
+
+    for (idx = 0; idx < sig->z.num_poly; idx++) {
+        if (!decode_z(&sig->z.poly[idx], &pkt))
+            return 0;
+    }
+
+    if (!hint_bits_decode(&sig->hint, &pkt, params->omega))
+        return 0;
+    /* Trailing bytes after the hint make the signature malformed */
+    if (PACKET_remaining(&pkt) != 0)
+        return 0;
+    return 1;
+}
+
+/*
+ * @brief Decode one polynomial from |in| using the signed 2^19 decoder when
+ * gamma1 is ML_DSA_GAMMA1_TWO_POWER_19, or the signed 2^17 decoder otherwise.
+ *
+ * @param out The polynomial to write coefficients to.
+ * @param in The encoded bytes.
+ * @param in_len The size of |in|
+ * @param gamma1 Selects which decoder is used.
+ * @returns 1 on success, or 0 on error.
+ */
+int ossl_ml_dsa_poly_decode_expand_mask(POLY *out,
+                                        const uint8_t *in, size_t in_len,
+                                        uint32_t gamma1)
+{
+    PACKET pkt;
+    DECODE_FN *decode_fn = poly_decode_signed_two_to_power_17;
+
+    if (gamma1 == ML_DSA_GAMMA1_TWO_POWER_19)
+        decode_fn = poly_decode_signed_two_to_power_19;
+
+    if (!PACKET_buf_init(&pkt, in, in_len))
+        return 0;
+    return decode_fn(out, &pkt);
+}
+
+/*
+ * @brief Encode a polynomial vector as an array of bytes.
+ * Where the polynomial coefficients have a range of [0..15] or [0..43]
+ * depending on the value of gamma2.
+ *
+ * See FIPS 204, Algorithm 28, w1Encode().
+ *
+ * @param w1 The vector to convert to bytes
+ * @param gamma2 either ML_DSA_GAMMA2_Q_MINUS1_DIV32 or ML_DSA_GAMMA2_Q_MINUS1_DIV88
+ * @param out The buffer to write the encoded vector to
+ * @param out_len The size of |out|
+ * @returns 1 if the vector was encoded successfully or 0 otherwise.
+ */
+int ossl_ml_dsa_w1_encode(const VECTOR *w1, uint32_t gamma2,
+                          uint8_t *out, size_t out_len)
+{
+    WPACKET pkt;
+    ENCODE_FN *pack;
+    size_t idx = 0;
+    int ok = 1;
+
+    if (!WPACKET_init_static_len(&pkt, out, out_len, 0))
+        return 0;
+
+    pack = (gamma2 == ML_DSA_GAMMA2_Q_MINUS1_DIV32) ? poly_encode_4_bits
+                                                    : poly_encode_6_bits;
+
+    /* Stop at the first polynomial that fails to encode */
+    while (ok && idx < w1->num_poly) {
+        ok = (pack(&w1->poly[idx], &pkt) != 0);
+        idx++;
+    }
+
+    WPACKET_finish(&pkt);
+    return ok;
+}
--- a/crypto/ml_dsa/ml_dsa_hash.h
+++ b/crypto/ml_dsa/ml_dsa_hash.h
@ -0,0 +1,41 @@
+/*
+ * Copyright 2024 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include <openssl/evp.h>
+
+/*
+ * Initialise |ctx| (with a NULL digest type and NULL params), absorb |in| and
+ * squeeze |out_len| bytes into |out|.
+ * Returns 1 only if every step returned 1, otherwise 0.
+ */
+static ossl_inline ossl_unused int
+shake_xof(EVP_MD_CTX *ctx, const uint8_t *in, size_t in_len,
+          uint8_t *out, size_t out_len)
+{
+    if (EVP_DigestInit_ex2(ctx, NULL, NULL) != 1)
+        return 0;
+    if (EVP_DigestUpdate(ctx, in, in_len) != 1)
+        return 0;
+    return EVP_DigestSqueeze(ctx, out, out_len) == 1;
+}
+
+/*
+ * Same as a single-input XOF call, but absorbs |in1| followed by |in2|
+ * before squeezing |out_len| bytes into |out|.
+ * Returns 1 if every step succeeded, otherwise 0.
+ */
+static ossl_inline ossl_unused int
+shake_xof_2(EVP_MD_CTX *ctx, const uint8_t *in1, size_t in1_len,
+            const uint8_t *in2, size_t in2_len, uint8_t *out, size_t out_len)
+{
+    if (!EVP_DigestInit_ex2(ctx, NULL, NULL))
+        return 0;
+    if (!EVP_DigestUpdate(ctx, in1, in1_len))
+        return 0;
+    if (!EVP_DigestUpdate(ctx, in2, in2_len))
+        return 0;
+    if (!EVP_DigestSqueeze(ctx, out, out_len))
+        return 0;
+    return 1;
+}
+
+/*
+ * Absorbs |in1|, |in2| and |in3| in that order, then squeezes |out_len| bytes
+ * into |out|.
+ * Returns 1 if every step succeeded, otherwise 0.
+ */
+static ossl_inline ossl_unused int
+shake_xof_3(EVP_MD_CTX *ctx, const uint8_t *in1, size_t in1_len,
+            const uint8_t *in2, size_t in2_len,
+            const uint8_t *in3, size_t in3_len, uint8_t *out, size_t out_len)
+{
+    if (!EVP_DigestInit_ex2(ctx, NULL, NULL))
+        return 0;
+    if (!EVP_DigestUpdate(ctx, in1, in1_len))
+        return 0;
+    if (!EVP_DigestUpdate(ctx, in2, in2_len))
+        return 0;
+    if (!EVP_DigestUpdate(ctx, in3, in3_len))
+        return 0;
+    if (!EVP_DigestSqueeze(ctx, out, out_len))
+        return 0;
+    return 1;
+}
--- a/crypto/ml_dsa/ml_dsa_key.c
+++ b/crypto/ml_dsa/ml_dsa_key.c
@ -15,6 +15,7 @@
 #include "ml_dsa_key.h"
 #include "ml_dsa_params.h"
 #include "ml_dsa_matrix.h"
+#include "ml_dsa_hash.h"

 /**
 * @brief Create a new ML_DSA_KEY object
@ -26,15 +27,15 @@
 ML_DSA_KEY *ossl_ml_dsa_key_new(OSSL_LIB_CTX *libctx, const char *alg)
 {
    ML_DSA_KEY *ret;
-    size_t sz;
+    size_t poly_sz;
    const ML_DSA_PARAMS *params = ossl_ml_dsa_params_get(alg);
    POLY *poly;

    if (params == NULL)
        return NULL;

-    sz = sizeof(POLY) * (params->k * 3 + params->l);
-    ret = OPENSSL_zalloc(sizeof(*ret) + sz);
+    poly_sz = sizeof(POLY) * (params->k * 3 + params->l);
+    ret = OPENSSL_zalloc(sizeof(*ret) + poly_sz);
    if (ret != NULL) {
        if (!CRYPTO_NEW_REF(&ret->references, 1)) {
            OPENSSL_free(ret);
@ -105,21 +106,26 @@ int ossl_ml_dsa_key_equal(const ML_DSA_KEY *key1, const ML_DSA_KEY *key2,
 {
    if (key1->params != key2->params)
        return 0;
-    if (key1->pub_encoding != NULL) {
-        if (key2->pub_encoding == NULL
-                || memcmp(key1->pub_encoding, key1->pub_encoding,
-                          key1->params->pk_len) != 0)
+
+    if ((selection & OSSL_KEYMGMT_SELECT_KEYPAIR) != 0) {
+        if ((selection & OSSL_KEYMGMT_SELECT_PUBLIC_KEY) != 0) {
+            if (key1->pub_encoding != NULL) {
+                if (key2->pub_encoding == NULL
+                        || memcmp(key1->pub_encoding, key2->pub_encoding,
+                                  key1->params->pk_len) != 0)
+                    return 0;
+            } else if (key2->pub_encoding != NULL) {
+                return 0;
+            }
+        }
+        if (key1->priv_encoding != NULL) {
+            if (key2->priv_encoding == NULL
+                    || memcmp(key1->priv_encoding, key2->priv_encoding,
+                              key1->params->sk_len) != 0)
+                return 0;
+        } else if (key2->priv_encoding != NULL) {
            return 0;
-    } else if (key2->pub_encoding != NULL) {
-        return 0;
-    }
-    if (key1->priv_encoding != NULL) {
-        if (key2->priv_encoding == NULL
-                || memcmp(key1->priv_encoding, key1->priv_encoding,
-                          key1->params->sk_len) != 0)
-            return 0;
-    } else if (key2->priv_encoding != NULL) {
-        return 0;
+        }
    }
    return 1;
 }
@ -156,14 +162,14 @@ int ossl_ml_dsa_key_fromdata(ML_DSA_KEY *key, const OSSL_PARAM params[],
        p = OSSL_PARAM_locate_const(params, OSSL_PKEY_PARAM_PRIV_KEY);
        if (p != NULL) {
            if (p->data_type != OSSL_PARAM_OCTET_STRING
-                    || !ossl_ml_dsa_sk_decode(p->data, p->data_size, key))
+                    || !ossl_ml_dsa_sk_decode(key, p->data, p->data_size))
                return 0;
        }
    }
    p = OSSL_PARAM_locate_const(params, OSSL_PKEY_PARAM_PUB_KEY);
    if (p != NULL) {
        if (p->data_type != OSSL_PARAM_OCTET_STRING
-                || !ossl_ml_dsa_pk_decode(p->data, p->data_size, key))
+                || !ossl_ml_dsa_pk_decode(key, p->data, p->data_size))
            return 0;
    }
    return 1;
@ -181,28 +187,33 @@ int ossl_ml_dsa_key_fromdata(ML_DSA_KEY *key, const OSSL_PARAM params[],
 *        of the uncompressed public key polynomial t.
 * @returns 1 on success, or 0 on failure.
 */
-static int public_from_private(ML_DSA_CTX *ctx, const ML_DSA_KEY *key,
+static int public_from_private(const ML_DSA_KEY *key, EVP_MD_CTX *g_ctx,
                               VECTOR *t1, VECTOR *t0)
 {
-    const ML_DSA_PARAMS *params = ctx->params;
-    POLY polys[ML_DSA_K_MAX + ML_DSA_L_MAX + ML_DSA_K_MAX * ML_DSA_L_MAX];
+    const ML_DSA_PARAMS *params = key->params;
+    uint32_t k = params->k, l = params->l;
+    POLY *polys;
    MATRIX a_ntt;
    VECTOR s1_ntt;
    VECTOR t;

-    vector_init(&t, polys, params->k);
-    vector_init(&s1_ntt, polys + params->k, params->l);
-    matrix_init(&a_ntt, polys + params->k + params->l, params->k, params->l);
+    polys = OPENSSL_malloc(sizeof(*polys) * (k + l + k * l));
+    if (polys == NULL)
+        return 0;
+
+    vector_init(&t, polys, k);
+    vector_init(&s1_ntt, t.poly + k, l);
+    matrix_init(&a_ntt, s1_ntt.poly + l, k, l);

    /* Using rho generate A' = A in NTT form */
-    if (!ossl_ml_dsa_sample_expandA(ctx->g_ctx, key->rho, &a_ntt))
-        return 0;
+    if (!matrix_expand_A(g_ctx, key->rho, &a_ntt))
+        goto err;

    /* t = NTT_inv(A' * NTT(s1)) + s2 */
    vector_copy(&s1_ntt, &key->s1);
    vector_ntt(&s1_ntt);

-    ossl_ml_dsa_matrix_mult_vector(&a_ntt, &s1_ntt, &t);
+    matrix_mult_vector(&a_ntt, &s1_ntt, &t);
    vector_ntt_inverse(&t);
    vector_add(&t, &key->s2, &t);

@ -211,6 +222,12 @@ static int public_from_private(ML_DSA_CTX *ctx, const ML_DSA_KEY *key,

    /* Zeroize secret */
    vector_zero(&s1_ntt);
+    /* Success: release the scratch polynomials before reporting it */
+    OPENSSL_free(polys);
    return 1;
+err:
+    /* Failure path (e.g. expanding A failed): free and report the error */
+    OPENSSL_free(polys);
+    return 0;
 }

@ -219,32 +232,42 @@ int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key)
    int ret = 0;
    ML_DSA_CTX *ctx = NULL;
    VECTOR t1, t0;
-    POLY polys[ML_DSA_K_MAX * 2];
+    POLY *polys = NULL;
+    uint32_t k = key->params->k;

    if (key->pub_encoding == NULL || key->priv_encoding == 0)
        return 0;

+    polys = OPENSSL_malloc(sizeof(*polys) * (2 * k));
+    if (polys == NULL)
+        return 0;
+
    ctx = ossl_ml_dsa_ctx_new(key->params->alg, key->libctx, key->propq);
    if (ctx == NULL)
-        return 0;
+        goto err; /* |polys| is already allocated; err: frees it */

-    vector_init(&t1, polys, key->params->k);
-    vector_init(&t0, polys + key->params->k, key->params->k);
-    if (!public_from_private(ctx, key, &t1, &t0))
+    vector_init(&t1, polys, k);
+    vector_init(&t0, polys + k, k);
+    if (!public_from_private(key, ctx->g_ctx, &t1, &t0))
        goto err;

    ret = vector_equal(&t1, &key->t1) && vector_equal(&t0, &key->t0);
 err:
    ossl_ml_dsa_ctx_free(ctx);
+    OPENSSL_free(polys);
    return ret;
 }

-static int shake_xof(EVP_MD_CTX *ctx, const uint8_t *in, size_t in_len,
-                     uint8_t *out, size_t out_len)
+int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key, ML_DSA_CTX *ctx)
 {
-    return (EVP_DigestInit_ex2(ctx, NULL, NULL) == 1
-            && EVP_DigestUpdate(ctx, in, in_len) == 1
-            && EVP_DigestFinalXOF(ctx, out, out_len) == 1);
+    if (key->pub_encoding != NULL)
+        return 1;
+    if (key->priv_encoding == NULL)
+        return 0;
+    return public_from_private(key, ctx->g_ctx, &key->t1, &key->t0)
+        && ossl_ml_dsa_pk_encode(key)
+        && shake_xof(ctx->h_ctx, key->pub_encoding, key->params->pk_len,
+                     key->tr, sizeof(key->tr));
 }

 /*
@ -280,9 +303,8 @@ static int keygen_internal(ML_DSA_CTX *ctx, const uint8_t *seed, size_t seed_len
    memcpy(out->rho, rho, sizeof(out->rho));
    memcpy(out->K, K, sizeof(out->K));

-    ret = ossl_ml_dsa_sample_expandS(ctx->h_ctx, params->eta, priv_seed,
-                                     &out->s1, &out->s2)
-        && public_from_private(ctx, out, &out->t1, &out->t0)
+    ret = vector_expand_S(ctx->h_ctx, params->eta, priv_seed, &out->s1, &out->s2)
+        && public_from_private(out, ctx->g_ctx, &out->t1, &out->t0)
        && ossl_ml_dsa_pk_encode(out)
        && shake_xof(ctx->h_ctx, out->pub_encoding, out->params->pk_len,
                     out->tr, sizeof(out->tr))
@ -305,8 +327,8 @@ int ossl_ml_dsa_generate_key(ML_DSA_CTX *ctx, OSSL_LIB_CTX *lib_ctx,
        return 0;

    if (entropy != NULL && entropy_len != 0) {
-        if (entropy_len < seed_len)
-            goto err;
+        if (entropy_len != seed_len)
+            return 0;
        memcpy(seed, entropy, seed_len);
    } else {
        if (RAND_priv_bytes_ex(lib_ctx, seed, seed_len, 0) <= 0)
@ -347,7 +369,7 @@ size_t ossl_ml_dsa_key_get_pub_len(const ML_DSA_KEY *key)

 size_t ossl_ml_dsa_key_get_collision_strength_bits(const ML_DSA_KEY *key)
 {
-    return key->params->strength;
+    return key->params->bit_strength;
 }

 /* Returns the private key data or NULL if there is no private key */
--- a/crypto/ml_dsa/ml_dsa_key_compress.c
+++ b/crypto/ml_dsa/ml_dsa_key_compress.c
@ -64,7 +64,7 @@ uint32_t ossl_ml_dsa_key_compress_high_bits(uint32_t r, uint32_t gamma2)
    uint32_t r1 = (r + 127) >> 7;

    /* TODO - figure out what this is doing */
-    if (gamma2 == ML_DSA_Q_MINUS1_DIV32) {
+    if (gamma2 == ML_DSA_GAMMA2_Q_MINUS1_DIV32) {
        r1 = (r1 * 1025 + (1 << 21)) >> 22;
        r1 &= 15; /* mod 16 */
        return r1;
--- a/crypto/ml_dsa/ml_dsa_local.h
+++ b/crypto/ml_dsa/ml_dsa_local.h
@ -12,28 +12,49 @@

 # include "crypto/ml_dsa.h"
 # include "internal/constant_time.h"
+# include "internal/packet.h"

-/* Maximimum size of the 'A' matrix */
-# define ML_DSA_K_MAX 8
-# define ML_DSA_L_MAX 7
-
+/* The following constants are shared by ML-DSA-44, ML-DSA-65 & ML-DSA-87 */
 # define ML_DSA_SEED_BYTES 32
 # define ML_DSA_Q 8380417   /* The modulus is 23 bits (2^23 - 2^13 + 1) */
 # define ML_DSA_Q_MINUS1_DIV2 ((ML_DSA_Q - 1) / 2)
-# define ML_DSA_Q_MINUS1_DIV32 ((ML_DSA_Q - 1) / 32)
+
 # define ML_DSA_Q_BITS 23
 # define ML_DSA_Q_INV 58728449  /* q^-1 satisfies: q^-1 * q = 1 mod 2^32 */
 # define ML_DSA_Q_NEG_INV 4236238847 /* Inverse of -q modulo 2^32 */
 # define ML_DSA_DEGREE_INV_MONTGOMERY 41978 /* Inverse of 256 mod q, in Montgomery form. */

-# define ML_DSA_D_BITS 13   /* The number of bits dropped from t */
+# define ML_DSA_D_BITS 13   /* The number of bits dropped from the public vector t */
 # define ML_DSA_NUM_POLY_COEFFICIENTS 256  /* The number of coefficients in the polynomials */
 # define ML_DSA_RHO_BYTES 32   /* p = Public Random Seed */
 # define ML_DSA_PRIV_SEED_BYTES 64 /* p' = Private random seed */
 # define ML_DSA_K_BYTES 32 /* K = Private random seed for signing */
-# define ML_DSA_TR_BYTES 64 /* Hash of public key used for signing */
-# define ML_DSA_MU_BYTES 64
-# define ML_DSA_RHO_PRIME_BYTES 64
+# define ML_DSA_TR_BYTES 64 /* Size of the Hash of the public key used for signing */
+# define ML_DSA_MU_BYTES 64 /* Size of the Hash for the message representative */
+# define ML_DSA_RHO_PRIME_BYTES 64 /* private random seed size */
+
+/*
+ * There is special case code related to encoding/decoding that tests
+ * for the following values.
+ */
+/*
+ * The possible value for eta - If a new value is added, then all code
+ * that accesses ML_DSA_ETA_4 would need to be modified.
+ */
+# define ML_DSA_ETA_4 4
+# define ML_DSA_ETA_2 2
+/*
+ * The possible values of gamma1 - If a new value is added, then all code
+ * that accesses ML_DSA_GAMMA1_TWO_POWER_19 would need to be modified.
+ */
+# define ML_DSA_GAMMA1_TWO_POWER_19 (1 << 19)
+# define ML_DSA_GAMMA1_TWO_POWER_17 (1 << 17)
+/*
+ * The possible values for gamma2 - If a new value is added, then all code
+ * that accesses ML_DSA_GAMMA2_Q_MINUS1_DIV32 would need to be modified.
+ */
+# define ML_DSA_GAMMA2_Q_MINUS1_DIV32 ((ML_DSA_Q - 1) / 32)
+# define ML_DSA_GAMMA2_Q_MINUS1_DIV88 ((ML_DSA_Q - 1) / 88)

 typedef struct ml_dsa_params_st ML_DSA_PARAMS;
 typedef struct poly_st POLY;
@ -47,7 +68,8 @@ typedef struct matrix_st MATRIX;
 *   - OpenSSL also uses pre-fetched EVP_MD_CTX objects for Hashing purposes.
 *
 * ML_DSA_CTX is a container to hold all these objects. This object is
- * resolved early and is then passed to most ML-DSA related functions.
+ * resolved early and can then be used to pass these values to
+ * most ML-DSA related functions.
 */
 struct ml_dsa_ctx_st {
    const ML_DSA_PARAMS *params;
@ -55,9 +77,19 @@ struct ml_dsa_ctx_st {
    EVP_MD_CTX *g_ctx; /* SHAKE-128 */
 };

-int ossl_ml_dsa_sample_expandA(EVP_MD_CTX *g_ctx, const uint8_t *rho, MATRIX *out);
-int ossl_ml_dsa_sample_expandS(EVP_MD_CTX *h_ctx, int eta, const uint8_t *seed,
-                               VECTOR *s1, VECTOR *s2);
+typedef struct ml_dsa_sig_st ML_DSA_SIG;
+
+int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const uint8_t *rho, MATRIX *out);
+int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, int eta, const uint8_t *seed,
+                                VECTOR *s1, VECTOR *s2);
+void ossl_ml_dsa_matrix_mult_vector(const MATRIX *matrix_kl, const VECTOR *vl,
+                                    VECTOR *vk);
+int ossl_ml_dsa_poly_expand_mask(POLY *out,
+                                 const uint8_t *seed, size_t seed_len,
+                                 uint32_t gamma1, EVP_MD_CTX *h_ctx);
+int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_len,
+                                    EVP_MD_CTX *h_ctx, uint32_t tau);
+
 void ossl_ml_dsa_poly_ntt(POLY *s);
 void ossl_ml_dsa_poly_ntt_inverse(POLY *s);
 void ossl_ml_dsa_poly_ntt_mult(const POLY *lhs, const POLY *rhs, POLY *out);
@ -75,9 +107,19 @@ uint32_t ossl_ml_dsa_key_compress_use_hint(uint32_t hint, uint32_t r,
                                           uint32_t gamma2);

 int ossl_ml_dsa_pk_encode(ML_DSA_KEY *key);
-int ossl_ml_dsa_pk_decode(const uint8_t *in, size_t in_len, ML_DSA_KEY *key);
+int ossl_ml_dsa_pk_decode(ML_DSA_KEY *key, const uint8_t *in, size_t in_len);
 int ossl_ml_dsa_sk_encode(ML_DSA_KEY *key);
-int ossl_ml_dsa_sk_decode(const uint8_t *in, size_t in_len, ML_DSA_KEY *key);
+int ossl_ml_dsa_sk_decode(ML_DSA_KEY *key, const uint8_t *in, size_t in_len);
+
+int ossl_ml_dsa_sig_encode(const ML_DSA_SIG *sig, const ML_DSA_PARAMS *params,
+                           uint8_t *out);
+int ossl_ml_dsa_sig_decode(ML_DSA_SIG *sig, const uint8_t *in, size_t in_len,
+                           const ML_DSA_PARAMS *params);
+int ossl_ml_dsa_w1_encode(const VECTOR *w1, uint32_t gamma2,
+                          uint8_t *out, size_t out_len);
+int ossl_ml_dsa_poly_decode_expand_mask(POLY *out,
+                                        const uint8_t *in, size_t in_len,
+                                        uint32_t gamma1);

 /*
 * @brief Reduces x mod q in constant time
@ -107,4 +149,35 @@ static ossl_inline ossl_unused uint32_t mod_sub(uint32_t a, uint32_t b)
    return reduce_once(ML_DSA_Q + a - b);
 }

+/*
+ * @brief Returns the absolute value in constant time.
+ * i.e. return is_positive(x) ? x : -x;
+ * |x| is an unsigned value that is treated as a two's complement signed
+ * number: values below 0x80000000 are returned unchanged, any other value
+ * is negated.
+ * Note: MSVC doesn't like applying the unary minus operator to unsigned types
+ * (warning C4146), so we write the negation as a subtraction from zero
+ * (0u - x).
+ */
+static ossl_inline ossl_unused uint32_t abs_signed(uint32_t x)
+{
+    return constant_time_select_int(constant_time_lt(x, 0x80000000), x, 0u - x);
+}
+
+/*
+ * @brief Constant time absolute value of a value taken modulo q.
+ * Values above (q-1)/2 are mapped to q - x, all others are returned as is,
+ * i.e return (q-1)/2 < x ? q - x : x;
+ */
+static ossl_inline ossl_unused uint32_t abs_mod_prime(uint32_t x)
+{
+    const unsigned int above_half = constant_time_lt(ML_DSA_Q_MINUS1_DIV2, x);
+
+    return constant_time_select_int(above_half, ML_DSA_Q - x, x);
+}
+
+/*
+ * @brief Constant time maximum of two values.
+ * i.e return y > x ? y : x;
+ */
+static ossl_inline ossl_unused uint32_t maximum(uint32_t x, uint32_t y)
+{
+    const unsigned int y_is_larger = constant_time_lt(x, y);
+
+    return constant_time_select_int(y_is_larger, y, x);
+}
+
 #endif /* OSSL_CRYPTO_ML_DSA_LOCAL_H */
--- a/crypto/ml_dsa/ml_dsa_matrix.h
+++ b/crypto/ml_dsa/ml_dsa_matrix.h
@ -7,12 +7,21 @@
 * https://www.openssl.org/source/license.html
 */

-/* A 'k' by 'l' Matrix object ('k' rows and 'l' columns) containing polynomial entries */
+/* A 'k' by 'l' Matrix object ('k' rows and 'l' columns) containing polynomial scalars */
 struct matrix_st {
    POLY *m_poly;
    size_t k, l;
 };

+/**
+ * @brief Initialize a Matrix object.
+ *
+ * @param m The matrix object.
+ * @param polys A preallocated array of k * l polynomial blocks. |m| does not
+ *              own/free this.
+ * @param k The number of rows
+ * @param l The number of columns
+ */
 static ossl_inline ossl_unused void
 matrix_init(MATRIX *m, POLY *polys, size_t k, size_t l)
 {
@ -21,5 +30,14 @@ matrix_init(MATRIX *m, POLY *polys, size_t k, size_t l)
    m->m_poly = polys;
 }

-void ossl_ml_dsa_matrix_mult_vector(const MATRIX *matrix_kl, const VECTOR *vl,
-                                    VECTOR *vk);
+/*
+ * Shorthand wrapper that forwards |a|, |s| and |t| unchanged to
+ * ossl_ml_dsa_matrix_mult_vector().
+ */
+static ossl_inline ossl_unused void
+matrix_mult_vector(const MATRIX *a, const VECTOR *s, VECTOR *t)
+{
+    ossl_ml_dsa_matrix_mult_vector(a, s, t);
+}
+
+/*
+ * Shorthand wrapper that forwards its arguments unchanged to
+ * ossl_ml_dsa_matrix_expand_A() and returns that function's result.
+ */
+static ossl_inline ossl_unused int
+matrix_expand_A(EVP_MD_CTX *g_ctx, const uint8_t *rho, MATRIX *out)
+{
+    return ossl_ml_dsa_matrix_expand_A(g_ctx, rho, out);
+}
--- a/crypto/ml_dsa/ml_dsa_params.c
+++ b/crypto/ml_dsa/ml_dsa_params.c
@ -11,14 +11,35 @@
 #include "ml_dsa_local.h"
 #include "ml_dsa_params.h"

-/*
- * See FIPS 204 Section 4 Table 1 & Table 2
- *                    tau strength gamma1 k l eta beta omega sc sklen  pklen siglen
- */
-#define OSSL_ML_DSA_65  49, 192, 1 << 19, 6, 5, 4, 196, 55, 3, 4032, 1952, 3309
+/* See FIPS 204 Section 4 Table 1 & Table 2 */
+#define ML_DSA_65_TAU 49
+#define ML_DSA_65_LAMBDA 192
+#define ML_DSA_65_K 6
+#define ML_DSA_65_L 5
+#define ML_DSA_65_ETA ML_DSA_ETA_4
+#define ML_DSA_65_BETA 196
+#define ML_DSA_65_OMEGA 55
+#define ML_DSA_65_SECURITY_CATEGORY 3
+#define ML_DSA_65_PRIV_LEN 4032
+#define ML_DSA_65_PUB_LEN 1952
+#define ML_DSA_65_SIG_LEN 3309

 static const ML_DSA_PARAMS ml_dsa_params[] = {
-    {"ML-DSA-65", OSSL_ML_DSA_65},
+    { "ML-DSA-65",
+      ML_DSA_65_TAU,
+      ML_DSA_65_LAMBDA,
+      ML_DSA_GAMMA1_TWO_POWER_19,
+      ML_DSA_GAMMA2_Q_MINUS1_DIV32,
+      ML_DSA_65_K,
+      ML_DSA_65_L,
+      ML_DSA_65_ETA,
+      ML_DSA_65_BETA,
+      ML_DSA_65_OMEGA,
+      ML_DSA_65_SECURITY_CATEGORY,
+      ML_DSA_65_PRIV_LEN,
+      ML_DSA_65_PUB_LEN,
+      ML_DSA_65_SIG_LEN
+    },
    {NULL},
 };

--- a/crypto/ml_dsa/ml_dsa_params.h
+++ b/crypto/ml_dsa/ml_dsa_params.h
@ -16,8 +16,9 @@
 struct ml_dsa_params_st {
    const char *alg;
    int tau;    /* Number of +/-1's in polynomial c */
-    int strength; /* The collision strength */
+    int bit_strength; /* The collision strength (lambda) */
    int gamma1; /* coefficient range of y */
+    int gamma2; /* low-order rounding range */
    size_t k, l; /* matrix dimensions of 'A' */
    int eta;    /* Private key range */
    int beta;   /* tau * eta */
--- a/crypto/ml_dsa/ml_dsa_poly.h
+++ b/crypto/ml_dsa/ml_dsa_poly.h
@ -15,6 +15,12 @@ struct poly_st {
    uint32_t coeff[ML_DSA_NUM_POLY_COEFFICIENTS];
 };

+/* Sets every coefficient of the polynomial |p| to zero. */
+static ossl_inline ossl_unused void
+poly_zero(POLY *p)
+{
+    /* Size the clear from the array that is actually being written */
+    memset(p->coeff, 0, sizeof(p->coeff));
+}
+
 /**
 * @brief Polynomial addition.
 *
@ -58,6 +64,29 @@ poly_equal(const POLY *a, const POLY *b)
    return CRYPTO_memcmp(a, b, sizeof(*a)) == 0;
 }

+/* Shorthand wrapper that forwards |p| to ossl_ml_dsa_poly_ntt(). */
+static ossl_inline ossl_unused void
+poly_ntt(POLY *p)
+{
+    ossl_ml_dsa_poly_ntt(p);
+}
+
+/*
+ * Fills |out| using ossl_ml_dsa_poly_sample_in_ball() and, when that
+ * succeeds, passes |out| through poly_ntt().
+ * Returns 1 on success, or 0 if the sampling step failed (in which case
+ * poly_ntt() is not applied).
+ */
+static ossl_inline ossl_unused int
+poly_sample_in_ball_ntt(POLY *out, const uint8_t *seed, int seed_len,
+                        EVP_MD_CTX *h_ctx, uint32_t tau)
+{
+    int sampled = ossl_ml_dsa_poly_sample_in_ball(out, seed, seed_len,
+                                                  h_ctx, tau);
+
+    if (sampled)
+        poly_ntt(out);
+    return sampled ? 1 : 0;
+}
+
+/*
+ * Shorthand wrapper that forwards its arguments unchanged to
+ * ossl_ml_dsa_poly_expand_mask() and returns that function's result.
+ */
+static ossl_inline ossl_unused int
+poly_expand_mask(POLY *out, const uint8_t *seed, size_t seed_len,
+                 uint32_t gamma1, EVP_MD_CTX *h_ctx)
+{
+    return ossl_ml_dsa_poly_expand_mask(out, seed, seed_len, gamma1, h_ctx);
+}
+
 /**
 * @brief Decompose the coefficients of a polynomial into (r1, r0) such that
 * coeff[i] == t1[i] * 2^13 + t0[i] mod q
@ -76,5 +105,107 @@ poly_power2_round(const POLY *t, POLY *t1, POLY *t0)

    for (i = 0; i < ML_DSA_NUM_POLY_COEFFICIENTS; i++)
        ossl_ml_dsa_key_compress_power2_round(t->coeff[i],
-                                              &t1->coeff[i], &t0->coeff[i]);
+                                              t1->coeff + i, t0->coeff + i);
 }
+
+/*
+ * Shifts every coefficient of |in| left by ML_DSA_D_BITS bits and stores the
+ * results in |out|. |in| is only read, so it is const-qualified like the
+ * inputs of the other per-coefficient helpers in this file.
+ */
+static ossl_inline ossl_unused void
+poly_scale_power2_round(const POLY *in, POLY *out)
+{
+    int i;
+
+    for (i = 0; i < ML_DSA_NUM_POLY_COEFFICIENTS; i++)
+        out->coeff[i] = (in->coeff[i] << ML_DSA_D_BITS);
+}
+
+/*
+ * Applies ossl_ml_dsa_key_compress_high_bits() with |gamma2| to every
+ * coefficient of |in| and stores the results in |out|.
+ */
+static ossl_inline ossl_unused void
+poly_high_bits(const POLY *in, uint32_t gamma2, POLY *out)
+{
+    size_t n;
+
+    for (n = 0; n < ML_DSA_NUM_POLY_COEFFICIENTS; ++n) {
+        const uint32_t r = in->coeff[n];
+
+        out->coeff[n] = ossl_ml_dsa_key_compress_high_bits(r, gamma2);
+    }
+}
+
+/*
+ * Applies ossl_ml_dsa_key_compress_low_bits() with |gamma2| to every
+ * coefficient of |in| and stores the results in |out|.
+ */
+static ossl_inline ossl_unused void
+poly_low_bits(const POLY *in, uint32_t gamma2, POLY *out)
+{
+    size_t n;
+
+    for (n = 0; n < ML_DSA_NUM_POLY_COEFFICIENTS; ++n) {
+        const uint32_t r = in->coeff[n];
+
+        out->coeff[n] = ossl_ml_dsa_key_compress_low_bits(r, gamma2);
+    }
+}
+
+/*
+ * For each coefficient position, stores in |out| the result of
+ * ossl_ml_dsa_key_compress_make_hint() applied to the matching coefficients
+ * of |ct0|, |cs2| and |w| together with |gamma2|.
+ */
+static ossl_inline ossl_unused void
+poly_make_hint(const POLY *ct0, const POLY *cs2, const POLY *w, uint32_t gamma2,
+               POLY *out)
+{
+    size_t n;
+
+    for (n = 0; n < ML_DSA_NUM_POLY_COEFFICIENTS; ++n) {
+        const uint32_t c0 = ct0->coeff[n];
+        const uint32_t c2 = cs2->coeff[n];
+        const uint32_t wn = w->coeff[n];
+
+        out->coeff[n] = ossl_ml_dsa_key_compress_make_hint(c0, c2, gamma2, wn);
+    }
+}
+
+/*
+ * For each coefficient position, stores in |out| the result of
+ * ossl_ml_dsa_key_compress_use_hint() applied to the matching coefficients
+ * of |h| and |r| together with |gamma2|.
+ */
+static ossl_inline ossl_unused void
+poly_use_hint(const POLY *h, const POLY *r, uint32_t gamma2, POLY *out)
+{
+    size_t n;
+
+    for (n = 0; n < ML_DSA_NUM_POLY_COEFFICIENTS; ++n) {
+        const uint32_t hint = h->coeff[n];
+        const uint32_t rn = r->coeff[n];
+
+        out->coeff[n] = ossl_ml_dsa_key_compress_use_hint(hint, rn, gamma2);
+    }
+}
+
+/*
+ * Raises |*mx| to the largest abs_mod_prime() value found among the
+ * coefficients of |p|. |*mx| is read as the starting value, so the caller
+ * must initialise it; it is never lowered.
+ */
+static ossl_inline ossl_unused void
+poly_max(const POLY *p, uint32_t *mx)
+{
+    size_t n;
+
+    for (n = 0; n < ML_DSA_NUM_POLY_COEFFICIENTS; n++)
+        *mx = maximum(*mx, abs_mod_prime(p->coeff[n]));
+}
+
+/*
+ * Raises |*mx| to the largest abs_signed() value found among the
+ * coefficients of |p|. |*mx| is read as the starting value, so the caller
+ * must initialise it; it is never lowered.
+ */
+static ossl_inline ossl_unused void
+poly_max_signed(const POLY *p, uint32_t *mx)
+{
+    size_t n;
+
+    for (n = 0; n < ML_DSA_NUM_POLY_COEFFICIENTS; n++)
+        *mx = maximum(*mx, abs_signed(p->coeff[n]));
+}
+
+#if defined(ML_DSA_DEBUG)
+/*
+ * Debug helper (only built when ML_DSA_DEBUG is defined).
+ * Prints the coefficients of |p| to stdout in hexadecimal, each followed by a
+ * comma, starting a new line after every 32 coefficients.
+ */
+static ossl_inline ossl_unused void poly_print(const POLY *p)
+{
+    size_t i;
+
+    for (i = 0; i < ML_DSA_NUM_POLY_COEFFICIENTS; ++i) {
+        /* Break the output into rows of 32 values */
+        if (i > 0 && ((i & 31) == 0))
+            printf("\n");
+        printf("%3x,", p->coeff[i]);
+    }
+    printf("\n");
+}
+
+/*
+ * Debug helper (only built when ML_DSA_DEBUG is defined).
+ * Prints the coefficients of |p| to stdout as signed decimals, 32 per line.
+ * A coefficient larger than ML_DSA_Q_MINUS1_DIV2 is printed as the negative
+ * value (coefficient - ML_DSA_Q).
+ */
+static ossl_inline ossl_unused void poly_print_signed(const POLY *p)
+{
+    size_t i;
+
+    for (i = 0; i < ML_DSA_NUM_POLY_COEFFICIENTS; ++i) {
+        /* Break the output into rows of 32 values */
+        if (i > 0 && ((i & 31) == 0))
+            printf("\n");
+        printf("%3d,", p->coeff[i] > ML_DSA_Q_MINUS1_DIV2
+                       ? (int)p->coeff[i] - (int)ML_DSA_Q : (int)p->coeff[i]);
+    }
+    printf("\n");
+}
+#endif
--- a/crypto/ml_dsa/ml_dsa_sample.c
+++ b/crypto/ml_dsa/ml_dsa_sample.c
@ -7,13 +7,15 @@
 * https://www.openssl.org/source/license.html
 */

-#include <openssl/evp.h>
 #include "ml_dsa_local.h"
 #include "ml_dsa_vector.h"
 #include "ml_dsa_matrix.h"
+#include "ml_dsa_hash.h"
+#include "internal/sha3.h"
+#include "internal/packet.h"

-#define SHAKE128_BLOCKSIZE 168
-#define SHAKE256_BLOCKSIZE 136
+#define SHAKE128_BLOCKSIZE SHA3_BLOCKSIZE(128)
+#define SHAKE256_BLOCKSIZE SHA3_BLOCKSIZE(256)

 typedef int (COEFF_FROM_NIBBLE_FUNC)(uint32_t nibble, uint32_t *out);

@ -104,23 +106,22 @@ static int rej_ntt_poly(EVP_MD_CTX *g_ctx,
    int j = 0;
    uint8_t blocks[SHAKE128_BLOCKSIZE], *b, *end = blocks + sizeof(blocks);

-    if (EVP_DigestInit_ex2(g_ctx, NULL, NULL) != 1
-            || EVP_DigestUpdate(g_ctx, seed, seed_len) != 1)
+    /*
+     * Instead of just squeezing 3 bytes at a time, we grab a whole block
+     * Note that the shake128 blocksize of 168 is divisible by 3.
+     */
+    if (!shake_xof(g_ctx, seed, seed_len, blocks, sizeof(blocks)))
        return 0;

    while (1) {
-        /*
-         * Instead of just squeezing 3 bytes at a time, we grab a whole block
-         * Note that the shake128 blocksize of 168 is divisible by 3.
-         */
-        if (!EVP_DigestSqueeze(g_ctx, blocks, sizeof(blocks)))
-            return 0;
        for (b = blocks; b < end; b += 3) {
            if (coeff_from_three_bytes(b, &(out->coeff[j]))) {
                if (++j >= ML_DSA_NUM_POLY_COEFFICIENTS)
                    return 1;   /* finished */
            }
        }
+        if (!EVP_DigestSqueeze(g_ctx, blocks, sizeof(blocks)))
+            return 0;
    }
 }

@ -148,14 +149,11 @@ static int rej_bounded_poly(EVP_MD_CTX *h_ctx,
    uint32_t z0, z1;
    uint8_t blocks[SHAKE256_BLOCKSIZE], *b, *end = blocks + sizeof(blocks);

-    if (EVP_DigestInit_ex2(h_ctx, NULL, NULL) != 1
-            || EVP_DigestUpdate(h_ctx, seed, seed_len) != 1)
+    /* Instead of just squeezing 1 byte at a time, we grab a whole block */
+    if (!shake_xof(h_ctx, seed, seed_len, blocks, sizeof(blocks)))
        return 0;

    while (1) {
-        /* Instead of just squeezing 1 byte at a time, we grab a whole block */
-        if (!EVP_DigestSqueeze(h_ctx, blocks, sizeof(blocks)))
-            return 0;
        for (b = blocks; b < end; b++) {
            z0 = *b & 0x0F; /* lower nibble of byte */
            z1 = *b >> 4;   /* high nibble of byte */
@ -167,6 +165,8 @@ static int rej_bounded_poly(EVP_MD_CTX *h_ctx,
                    && ++j >= ML_DSA_NUM_POLY_COEFFICIENTS)
                return 1;
        }
+        if (!EVP_DigestSqueeze(h_ctx, blocks, sizeof(blocks)))
+            return 0;
    }
 }

@ -182,8 +182,8 @@ static int rej_bounded_poly(EVP_MD_CTX *h_ctx,
 *            in the range of 0..q-1.
 * @returns 1 if the matrix was generated, or 0 on error.
 */
-int ossl_ml_dsa_sample_expandA(EVP_MD_CTX *g_ctx, const uint8_t *rho,
-                               MATRIX *out)
+int ossl_ml_dsa_matrix_expand_A(EVP_MD_CTX *g_ctx, const uint8_t *rho,
+                                MATRIX *out)
 {
    int ret = 0;
    size_t i, j;
@ -224,8 +224,8 @@ err:
 *           the range (q-eta)..0..eta
 * @returns 1 if s1 and s2 were successfully generated, or 0 otherwise.
 */
-int ossl_ml_dsa_sample_expandS(EVP_MD_CTX *h_ctx, int eta, const uint8_t *seed,
-                               VECTOR *s1, VECTOR *s2)
+int ossl_ml_dsa_vector_expand_S(EVP_MD_CTX *h_ctx, int eta, const uint8_t *seed,
+                                VECTOR *s1, VECTOR *s2)
 {
    int ret = 0;
    size_t i;
@ -234,7 +234,7 @@ int ossl_ml_dsa_sample_expandS(EVP_MD_CTX *h_ctx, int eta, const uint8_t *seed,
    uint8_t derived_seed[ML_DSA_PRIV_SEED_BYTES + 2];
    COEFF_FROM_NIBBLE_FUNC *coef_from_nibble_fn;

-    coef_from_nibble_fn = (eta == 4) ? coeff_from_nibble_4 : coeff_from_nibble_2;
+    coef_from_nibble_fn = (eta == ML_DSA_ETA_4) ? coeff_from_nibble_4 : coeff_from_nibble_2;

    /*
     * Each polynomial generated uses a unique seed that consists of
@ -260,3 +260,82 @@ int ossl_ml_dsa_sample_expandS(EVP_MD_CTX *h_ctx, int eta, const uint8_t *seed,
 err:
    return ret;
 }
+
+/*
+ * See FIPS 204, Algorithm 34, ExpandMask(), Step 4 & 5
+ *
+ * Produces XOF output from |seed| into a local buffer (32 * 20 bytes when
+ * gamma1 is ML_DSA_GAMMA1_TWO_POWER_19, otherwise 32 * 18 bytes) and decodes
+ * that buffer into |out|.
+ * Returns 1 on success, or 0 if either step fails.
+ */
+int ossl_ml_dsa_poly_expand_mask(POLY *out,
+                                 const uint8_t *seed, size_t seed_len,
+                                 uint32_t gamma1, EVP_MD_CTX *h_ctx)
+{
+    uint8_t buf[32 * 20];
+    size_t buf_len;
+
+    if (gamma1 == ML_DSA_GAMMA1_TWO_POWER_19)
+        buf_len = 32 * 20;
+    else
+        buf_len = 32 * 18;
+
+    if (!shake_xof(h_ctx, seed, seed_len, buf, buf_len))
+        return 0;
+    return ossl_ml_dsa_poly_decode_expand_mask(out, buf, buf_len, gamma1) != 0;
+}
+
+/*
+ * @brief Sample a polynomial with coefficients in the range {-1..1}.
+ * The number of non zero values (hamming weight) is given by tau
+ *
+ * See FIPS 204, Algorithm 29, SampleInBall()
+ * This function is assumed to not be constant time.
+ * The algorithm is based on Durstenfeld's version of the Fisher-Yates shuffle.
+ *
+ * Note that the coefficients returned by this implementation are positive
+ * i.e one of q-1, 0, or 1.
+ *
+ * @param out_c The polynomial that receives the sampled coefficients.
+ * @param seed The seed that is fed to the XOF.
+ * @param seed_len The size of |seed| in bytes.
+ * @param h_ctx The digest context used to produce the XOF output.
+ * @param tau is the number of +1 or -1's in the polynomial 'out_c' (39, 49 or 60)
+ *            that is less than or equal to 64
+ * @returns 1 on success, or 0 if XOF output could not be produced.
+ */
+int ossl_ml_dsa_poly_sample_in_ball(POLY *out_c, const uint8_t *seed, int seed_len,
+                                    EVP_MD_CTX *h_ctx, uint32_t tau)
+{
+    uint8_t block[SHAKE256_BLOCKSIZE];
+    uint64_t signs = 0;
+    size_t offset = 8; /* block[0..7] supply the sign bits, so skip them */
+    size_t end, i;
+
+    /*
+     * Rather than squeeze 8 bytes followed by lots of 1 byte squeezes
+     * the SHAKE blocksize is squeezed each time and buffered into 'block'.
+     */
+    if (!shake_xof(h_ctx, seed, seed_len, block, sizeof(block)))
+        return 0;
+
+    /*
+     * Grab the first 64 bits - since tau <= 64.
+     * Each bit gives a +1 or -1 value.
+     * The 8 bytes are combined as a little-endian integer explicitly so that
+     * bit i of |signs| is the same on every host; copying the bytes with
+     * memcpy() only gives that bit order on little-endian machines.
+     */
+    for (i = 0; i < 8; i++)
+        signs |= (uint64_t)block[i] << (8 * i);
+
+    poly_zero(out_c);
+
+    /* Loop tau times */
+    for (end = ML_DSA_NUM_POLY_COEFFICIENTS - tau;
+         end < ML_DSA_NUM_POLY_COEFFICIENTS; end++) {
+        size_t index; /* index is a random offset to write +1 or -1 */
+
+        /* rejection sample in {0..end} to choose an index to place -1 or 1 into */
+        for (;;) {
+            if (offset == sizeof(block)) {
+                /* squeeze another block if the bytes from block have been used */
+                if (!EVP_DigestSqueeze(h_ctx, block, sizeof(block)))
+                    return 0;
+                offset = 0;
+            }
+
+            index = block[offset++];
+            if (index <= end)
+                break;
+        }
+
+        /*
+         * In-place swap the coefficient we are about to replace to the end so
+         * we don't lose any values that have been already written.
+         */
+        out_c->coeff[end] = out_c->coeff[index];
+        /* set the random coefficient value to either 1 or q-1 */
+        out_c->coeff[index] = mod_sub(1, 2 * (signs & 1));
+        signs >>= 1; /* grab the next random bit */
+    }
+    return 1;
+}
--- a/crypto/ml_dsa/ml_dsa_sign.c
+++ b/crypto/ml_dsa/ml_dsa_sign.c
@ -15,150 +15,380 @@
 #include "ml_dsa_key.h"
 #include "ml_dsa_params.h"
 #include "ml_dsa_matrix.h"
+#include "ml_dsa_sign.h"
+#include "ml_dsa_hash.h"

+#define ML_DSA_MAX_LAMBDA 256 /* bit strength for ML-DSA-87 */
+
+/*
+ * @brief Wire up a Signature object so that its members refer to storage
+ * supplied by the caller. Nothing is allocated or copied here, and |sig|
+ * does not own (and must not free) |hint|, |z| or |c_tilde|.
+ *
+ * @param sig The ML_DSA_SIG to set up.
+ * @param hint Caller provided storage for |k| polynomials
+ * @param k The number of polynomials in |hint|
+ * @param z Caller provided storage for |l| polynomials
+ * @param l The number of polynomials in |z|
+ * @param c_tilde Caller provided byte buffer
+ * @param c_tilde_len The size of |c_tilde| in bytes
+ */
+static void signature_init(ML_DSA_SIG *sig,
+                           POLY *hint, uint32_t k, POLY *z, uint32_t l,
+                           uint8_t *c_tilde, size_t c_tilde_len)
+{
+    /* The challenge hash buffer is simply referenced */
+    sig->c_tilde_len = c_tilde_len;
+    sig->c_tilde = c_tilde;
+
+    /* The two vectors are views onto the caller's polynomial arrays */
+    vector_init(&sig->hint, hint, k);
+    vector_init(&sig->z, z, l);
+}

 /*
 * FIPS 204, Algorithm 7, ML-DSA.Sign_internal()
 * @returns 1 on success and 0 on failure.
 */
-template <int K, int L>
-static int ossl_ml_dsa_sign_internal(
-    uint8_t out_encoded_signature[signature_bytes<K>()],
-    const struct private_key<K, L> *priv, const uint8_t *msg, size_t msg_len,
-    const uint8_t *context_prefix, size_t context_prefix_len,
-    const uint8_t *context, size_t context_len,
-    const uint8_t randomizer[MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
-  uint8_t mu[kMuBytes];
-  struct BORINGSSL_keccak_st keccak_ctx;
-  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-  BORINGSSL_keccak_absorb(&keccak_ctx, priv->public_key_hash,
-                          sizeof(priv->public_key_hash));
-  BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len);
-  BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len);
-  BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len);
-  BORINGSSL_keccak_squeeze(&keccak_ctx, mu, kMuBytes);
+static int ml_dsa_sign_internal(ML_DSA_CTX *ctx, const ML_DSA_KEY *priv,
+                                const uint8_t *encoded_msg,
+                                size_t encoded_msg_len,
+                                const uint8_t *rnd, size_t rnd_len,
+                                uint8_t *out_sig)
+{
+    int ret = 0;
+    const ML_DSA_PARAMS *params = ctx->params;
+    EVP_MD_CTX *h_ctx = ctx->h_ctx;
+    uint32_t k = params->k, l = params->l;
+    uint32_t gamma1 = params->gamma1, gamma2 = params->gamma2;
+    uint8_t *alloc = NULL, *w1_encoded;
+    size_t w1_encoded_len = 128 * k;
+    /*
+     * Polynomial counts for the single scratch allocation below:
+     * 2k for the signature, 5k for the k sized vectors, 3l for the l sized
+     * vectors and k * l for the matrix (plus one for c_ntt).
+     * NOTE(review): the signature uses k (hint) + l (z) polynomials, so the
+     * 2k reserved for it is only sufficient while l <= k - confirm this holds
+     * for every parameter set.
+     */
+    size_t num_polys_sig_k = 2 * k;
+    size_t num_polys_k = 5 * k;
+    size_t num_polys_l = 3 * l;
+    size_t num_polys_k_by_l = k * l;
+    POLY *polys = NULL, *p, *c_ntt;
+    size_t alloc_len = w1_encoded_len
+                       + sizeof(*polys)
+                       * (1 + num_polys_k + num_polys_l
+                          + num_polys_k_by_l + num_polys_sig_k);
+    VECTOR s1_ntt, s2_ntt, t0_ntt, w, w1, cs1, cs2, y;
+    MATRIX a_ntt;
+    ML_DSA_SIG sig;
+    uint8_t mu[ML_DSA_MU_BYTES];
+    uint8_t rho_prime[ML_DSA_RHO_PRIME_BYTES];
+    uint8_t c_tilde[ML_DSA_MAX_LAMBDA / 4];
+    /* c_tilde is bit_strength / 4 bytes long */
+    size_t c_tilde_len = params->bit_strength >> 2;
+    size_t kappa;

-  uint8_t rho_prime[kRhoPrimeBytes];
-  BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-  BORINGSSL_keccak_absorb(&keccak_ctx, priv->k, sizeof(priv->k));
-  BORINGSSL_keccak_absorb(&keccak_ctx, randomizer,
-                          MLDSA_SIGNATURE_RANDOMIZER_BYTES);
-  BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes);
-  BORINGSSL_keccak_squeeze(&keccak_ctx, rho_prime, kRhoPrimeBytes);
+    /*
+     * Allocate a single blob for most of the variable size temporary variables.
+     * Mostly used for VECTOR POLYNOMIALS (every POLY is 1K).
+     */
+    alloc = OPENSSL_malloc(alloc_len);
+    if (alloc == NULL)
+        return 0;
+    w1_encoded = alloc;
+    /* Init the temp vectors to point to the allocated polys blob */
+    p = (POLY *)(w1_encoded + w1_encoded_len);
+    c_ntt = p++;
+    matrix_init(&a_ntt, p, k, l);
+    p += num_polys_k_by_l;
+    vector_init(&s2_ntt, p, k);
+    vector_init(&t0_ntt, s2_ntt.poly + k, k);
+    vector_init(&w, t0_ntt.poly + k, k);
+    vector_init(&w1, w.poly + k, k);
+    vector_init(&cs2, w1.poly + k, k);
+    p += num_polys_k;
+    vector_init(&s1_ntt, p, l);
+    vector_init(&y, p + l, l);
+    vector_init(&cs1, p + 2 * l, l);
+    p += num_polys_l;
+    /* The signature's hint gets k polynomials at p and z gets l at p + k */
+    signature_init(&sig, p, k, p + k, l, c_tilde, c_tilde_len);
+    /* End of the allocated blob setup */

-  // Intermediate values, allocated on the heap to allow use when there is a
-  // limited amount of stack.
-  struct values_st {
-    struct signature<K, L> sign;
-    vector<L> s1_ntt;
-    vector<K> s2_ntt;
-    vector<K> t0_ntt;
-    matrix<K, L> a_ntt;
-    vector<L> y;
-    vector<K> w;
-    vector<K> w1;
-    vector<L> cs1;
-    vector<K> cs2;
-  };
-  std::unique_ptr<values_st, DeleterFree<values_st>> values(
-      reinterpret_cast<struct values_st *>(OPENSSL_malloc(sizeof(values_st))));
-  if (values == NULL) {
-    return 0;
-  }
-  OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt));
-  vector_ntt(&values->s1_ntt);
+    /*
+     * Expand the matrix from rho, derive mu from tr and the encoded message,
+     * then derive rho_prime from K, rnd and mu.
+     */
+    if (!matrix_expand_A(ctx->g_ctx, priv->rho, &a_ntt)
+            || !shake_xof_2(h_ctx, priv->tr, sizeof(priv->tr),
+                            encoded_msg, encoded_msg_len, mu, sizeof(mu))
+            || !shake_xof_3(h_ctx, priv->K, sizeof(priv->K), rnd, rnd_len,
+                            mu, sizeof(mu), rho_prime, sizeof(rho_prime)))
+        goto err;

-  OPENSSL_memcpy(&values->s2_ntt, &priv->s2, sizeof(values->s2_ntt));
-  vector_ntt(&values->s2_ntt);
+    /* Work on NTT transformed copies so the key's own vectors are untouched */
+    vector_copy(&s1_ntt, &priv->s1);
+    vector_ntt(&s1_ntt);
+    vector_copy(&s2_ntt, &priv->s2);
+    vector_ntt(&s2_ntt);
+    vector_copy(&t0_ntt, &priv->t0);
+    vector_ntt(&t0_ntt);

-  OPENSSL_memcpy(&values->t0_ntt, &priv->t0, sizeof(values->t0_ntt));
-  vector_ntt(&values->t0_ntt);
+    /*
+     * kappa must not exceed 2^16. But the probability of it
+     * exceeding even 1000 iterations is vanishingly small.
+     */
+    for (kappa = 0; ; kappa += l) {
+        /*
+         * To save memory y_ntt reuses the storage of cs1, and both r0 and ct0
+         * reuse the storage of w1. Each alias is only used once the previous
+         * contents are no longer needed.
+         */
+        VECTOR *y_ntt = &cs1;
+        VECTOR *r0 = &w1;
+        VECTOR *ct0 = &w1;
+        uint32_t z_max, r0_max, ct0_max, h_ones;

-  matrix_expand(&values->a_ntt, priv->rho);
+        vector_expand_mask(&y, rho_prime, sizeof(rho_prime), kappa,
+                           gamma1, ctx->h_ctx);
+        vector_copy(y_ntt, &y);
+        vector_ntt(y_ntt);

-  // kappa must not exceed 2**16/L = 13107. But the probability of it
-  // exceeding even 1000 iterations is vanishingly small.
-  for (size_t kappa = 0;; kappa += L) {
-    vector_expand_mask(&values->y, rho_prime, kappa);
+        /* w = NTT_inverse(a_ntt * NTT(y)) */
+        matrix_mult_vector(&a_ntt, y_ntt, &w);
+        vector_ntt_inverse(&w);

-    vector<L> *y_ntt = &values->cs1;
-    OPENSSL_memcpy(y_ntt, &values->y, sizeof(*y_ntt));
-    vector_ntt(y_ntt);
+        /* w1 is encoded into w1_encoded before its storage is reused as r0 */
+        vector_high_bits(&w, gamma2, &w1);
+        ossl_ml_dsa_w1_encode(&w1, gamma2, w1_encoded, w1_encoded_len);

-    matrix_mult(&values->w, &values->a_ntt, y_ntt);
-    vector_inverse_ntt(&values->w);
+        /* c_tilde is derived from mu and w1_encoded. On failure ret stays 0 */
+        if (!shake_xof_2(h_ctx, mu, sizeof(mu), w1_encoded, 128 * k,
+                         c_tilde, c_tilde_len))
+            break;

-    vector_high_bits(&values->w1, &values->w);
-    uint8_t w1_encoded[128 * K];
-    w1_encode(w1_encoded, &values->w1);
+        if (!poly_sample_in_ball_ntt(c_ntt, c_tilde, c_tilde_len, ctx->h_ctx,
+                                     params->tau))
+            break;

-    BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256);
-    BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes);
-    BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, 128 * K);
-    BORINGSSL_keccak_squeeze(&keccak_ctx, values->sign.c_tilde,
-                             2 * lambda_bytes<K>());
+        /* cs1 (which overwrites y_ntt) and cs2 are taken out of NTT form */
+        vector_mult_scalar(&s1_ntt, c_ntt, &cs1);
+        vector_ntt_inverse(&cs1);
+        vector_mult_scalar(&s2_ntt, c_ntt, &cs2);
+        vector_ntt_inverse(&cs2);

-    scalar c_ntt;
-    scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde,
-                                  sizeof(values->sign.c_tilde), tau<K>());
-    scalar_ntt(&c_ntt);
+        /* z = y + cs1 */
+        vector_add(&y, &cs1, &sig.z);

-    vector_mult_scalar(&values->cs1, &values->s1_ntt, &c_ntt);
-    vector_inverse_ntt(&values->cs1);
-    vector_mult_scalar(&values->cs2, &values->s2_ntt, &c_ntt);
-    vector_inverse_ntt(&values->cs2);
+        /* r0 = lowbits(w - cs2) */
+        vector_sub(&w, &cs2, r0);
+        vector_low_bits(r0, gamma2, r0);

-    vector_add(&values->sign.z, &values->y, &values->cs1);
+        /*
+         * Leaking that the signature is rejected is fine as the next attempt at a
+         * signature will be (indistinguishable from) independent of this one.
+         */
+        z_max = vector_max(&sig.z);
+        r0_max = vector_max_signed(r0);
+        if (value_barrier_32(constant_time_ge(z_max, gamma1 - params->beta)
+                             | constant_time_ge(r0_max, gamma2 - params->beta)))
+            continue;

-    vector<K> *r0 = &values->w1;
-    vector_sub(r0, &values->w, &values->cs2);
-    vector_low_bits(r0, r0);
+        /* ct0 overwrites r0, which has already been checked above */
+        vector_mult_scalar(&t0_ntt, c_ntt, ct0);
+        vector_ntt_inverse(ct0);
+        vector_make_hint(ct0, &cs2, &w, gamma2, &sig.hint);

-    // Leaking the fact that a signature was rejected is fine as the next
-    // attempt at a signature will be (indistinguishable from) independent of
-    // this one. Note, however, that we additionally leak which of the two
-    // branches rejected the signature. Section 5.5 of
-    // https://pq-crystals.org/dilithium/data/dilithium-specification-round3.pdf
-    // describes this leak as OK. Note we leak less than what is described by
-    // the paper; we do not reveal which coefficient violated the bound, and
-    // we hide which of the |z_max| or |r0_max| bound failed. See also
-    // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/2bbab0fa_d241d35a/
-    uint32_t z_max = vector_max(&values->sign.z);
-    uint32_t r0_max = vector_max_signed(r0);
-    if (constant_time_declassify_w(
-            constant_time_ge_w(z_max, gamma1<K>() - beta<K>()) |
-            constant_time_ge_w(r0_max, kGamma2 - beta<K>()))) {
-      continue;
+        ct0_max = vector_max(ct0);
+        h_ones = vector_count_ones(&sig.hint);
+        /* Same reasoning applies to the leak as above */
+        if (value_barrier_32(constant_time_ge(ct0_max, gamma2)
+                             | constant_time_lt(params->omega, h_ones)))
+            continue;
+        /* All rejection checks passed: serialize the signature and stop */
+        ret = ossl_ml_dsa_sig_encode(&sig, params, out_sig);
+        break;
     }
-
-    vector<K> *ct0 = &values->w1;
-    vector_mult_scalar(ct0, &values->t0_ntt, &c_ntt);
-    vector_inverse_ntt(ct0);
-    vector_make_hint(&values->sign.h, ct0, &values->cs2, &values->w);
-
-    // See above.
-    uint32_t ct0_max = vector_max(ct0);
-    size_t h_ones = vector_count_ones(&values->sign.h);
-    if (constant_time_declassify_w(constant_time_ge_w(ct0_max, kGamma2) |
-                                   constant_time_lt_w(omega<K>(), h_ones))) {
-      continue;
-    }
-
-    // Although computed with the private key, the signature is public.
-    CONSTTIME_DECLASSIFY(values->sign.c_tilde, sizeof(values->sign.c_tilde));
-    CONSTTIME_DECLASSIFY(&values->sign.z, sizeof(values->sign.z));
-    CONSTTIME_DECLASSIFY(&values->sign.h, sizeof(values->sign.h));
-
-    CBB cbb;
-    CBB_init_fixed(&cbb, out_encoded_signature, signature_bytes<K>());
-    if (!mldsa_marshal_signature(&cbb, &values->sign)) {
-      return 0;
-    }
-
-    BSSL_CHECK(CBB_len(&cbb) == signature_bytes<K>());
-    return 1;
-  }
+err:
+    /* The scratch blob and rho_prime are derived from the private key */
+    OPENSSL_clear_free(alloc, alloc_len);
+    OPENSSL_cleanse(rho_prime, sizeof(rho_prime));
+    return ret;
 }

+/*
+ * See FIPS 204, Algorithm 8, ML-DSA.Verify_internal().
+ *
+ * @param ctx Supplies the parameter set and the hash contexts that are used.
+ * @param pub The key whose rho, tr and t1 values are used.
+ * @param msg_enc The (already encoded) message.
+ * @param msg_enc_len The size of |msg_enc|
+ * @param sig_enc The encoded signature to check.
+ * @param sig_enc_len The size of |sig_enc|
+ * @returns 1 if the signature is accepted, or 0 if it is rejected or if an
+ * allocation, decoding or hashing step fails.
+ */
+static int ml_dsa_verify_internal(ML_DSA_CTX *ctx, const ML_DSA_KEY *pub,
+                                  const uint8_t *msg_enc, size_t msg_enc_len,
+                                  const uint8_t *sig_enc, size_t sig_enc_len)
+{
+    int ret = 0;
+    uint8_t *alloc = NULL, *w1_encoded;
+    POLY *polys = NULL, *p, *c_ntt;
+    MATRIX a_ntt;
+    VECTOR az_ntt, ct1_ntt, *z_ntt, *w1, *w_approx;
+    ML_DSA_SIG sig;
+    const ML_DSA_PARAMS *params = ctx->params;
+    /*
+     * NOTE(review): k and l are read from pub->params while every other
+     * parameter is read from ctx->params - confirm these always agree.
+     */
+    uint32_t k = pub->params->k;
+    uint32_t l = pub->params->l;
+    size_t w1_encoded_len = 128 * k;
+    size_t num_polys_sig = k + l;
+    size_t num_polys_k = 2 * k;
+    size_t num_polys_l = 1 * l;
+    size_t num_polys_k_by_l = k * l;
+    uint8_t mu[ML_DSA_MU_BYTES];
+    uint8_t c_tilde[ML_DSA_MAX_LAMBDA / 4];
+    uint8_t c_tilde_sig[ML_DSA_MAX_LAMBDA / 4];
+    EVP_MD_CTX *h_ctx = ctx->h_ctx;
+    size_t c_tilde_len = params->bit_strength >> 2;
+    uint32_t z_max;

+    /* Allocate space for all the POLYNOMIALS used by temporary VECTORS */
+    alloc = OPENSSL_malloc(w1_encoded_len
+                           + sizeof(*polys) * (1 + num_polys_k
+                                               + num_polys_l
+                                               + num_polys_k_by_l
+                                               + num_polys_sig));
+    if (alloc == NULL)
+        return 0;
+    w1_encoded = alloc;
+    /* Init the temp vectors to point to the allocated polys blob */
+    p = (POLY *)(w1_encoded + w1_encoded_len);
+    c_ntt = p++;
+    matrix_init(&a_ntt, p, k, l);
+    p += num_polys_k_by_l;
+    /* The decoded c_tilde of the signature is stored in c_tilde_sig */
+    signature_init(&sig, p, k, p + k, l, c_tilde_sig, c_tilde_len);
+    p += num_polys_sig;
+    vector_init(&az_ntt, p, k);
+    vector_init(&ct1_ntt, p + k, k);

+    /* Decode the signature, expand the matrix from rho and derive mu */
+    if (!ossl_ml_dsa_sig_decode(&sig, sig_enc, sig_enc_len, ctx->params)
+            || !matrix_expand_A(ctx->g_ctx, pub->rho, &a_ntt)
+            || !shake_xof_2(h_ctx, pub->tr, sizeof(pub->tr),
+                            msg_enc, msg_enc_len, mu, sizeof(mu)))
+        goto err;
+    /* Compute verifiers challenge c_ntt = NTT(SampleInBall(c_tilde)) */
+    if (!poly_sample_in_ball_ntt(c_ntt, c_tilde_sig, c_tilde_len, ctx->h_ctx,
+                                 params->tau))
+        goto err;
+
+    /* ct1_ntt = NTT(c) * NTT(t1 * 2^d) */
+    vector_scale_power2_round_ntt(&pub->t1, &ct1_ntt);
+    vector_mult_scalar(&ct1_ntt, c_ntt, &ct1_ntt);
+
+    /* compute z_max early in order to reuse sig.z */
+    z_max = vector_max(&sig.z);
+
+    /* w_approx = NTT_inverse(A * NTT(z) - ct1_ntt) */
+    /* sig.z is converted to NTT form in place, and w_approx reuses az_ntt */
+    z_ntt = &sig.z;
+    vector_ntt(z_ntt);
+    matrix_mult_vector(&a_ntt, z_ntt, &az_ntt);
+    w_approx = &az_ntt;
+    vector_sub(&az_ntt, &ct1_ntt, w_approx);
+    vector_ntt_inverse(w_approx);
+
+    /* compute w1_encoded */
+    /* w1 also reuses the storage of w_approx, so the hint is applied in place */
+    w1 = w_approx;
+    vector_use_hint(&sig.hint, w_approx, params->gamma2, w1);
+    ossl_ml_dsa_w1_encode(w1, params->gamma2, w1_encoded, w1_encoded_len);
+
+    /* Recompute c_tilde from mu and w1_encoded (the third input is empty) */
+    if (!shake_xof_3(h_ctx, mu, sizeof(mu), w1_encoded, w1_encoded_len, NULL, 0,
+                     c_tilde, c_tilde_len))
+        goto err;
+
+    /*
+     * Accept only if the largest coefficient of z is below gamma1 - beta and
+     * the recomputed c_tilde matches the one decoded from the signature.
+     */
+    ret = (z_max < (uint32_t)(params->gamma1 - params->beta))
+        && memcmp(c_tilde, sig.c_tilde, c_tilde_len) == 0;
+err:
+    OPENSSL_free(alloc);
+    return ret;
+}
+
+/**
+ * @brief Encode a message
+ * See FIPS 204 Algorithm 2 Step 10 (and algorithm 3 Step 5).
+ *
+ * ML_DSA pure signatures are encoded as M' = 00 || ctx_len || ctx || msg
+ * Where ctx is the empty string by default and ctx_len <= 255.
+ *
+ * Note this code could be shared with SLH_DSA
+ *
+ * @param msg A message to encode
+ * @param msg_len The size of |msg|
+ * @param ctx An optional context to add to the message encoding. It may be
+ *            NULL if |ctx_len| is 0.
+ * @param ctx_len The size of |ctx|. It must be in the range 0..255
+ * @param encode Use the Pure signature encoding if this is 1, and do not
+ *               encode if this value is 0.
+ * @param tmp A small buffer that may be used if the message is small.
+ * @param tmp_len The size of |tmp|
+ * @param out_len The size of the returned encoded buffer.
+ * @returns A buffer containing the encoded message. If the passed in
+ * |tmp| buffer is big enough to hold the encoded message then it returns |tmp|
+ * otherwise it allocates memory which must be freed by the caller. If |encode|
+ * is 0 then it returns |msg|. NULL is returned if there is a failure, which
+ * includes |ctx_len| being too large, the encoded length not fitting in a
+ * size_t, and allocation failure.
+ */
+static uint8_t *msg_encode(const uint8_t *msg, size_t msg_len,
+                           const uint8_t *ctx, size_t ctx_len, int encode,
+                           uint8_t *tmp, size_t tmp_len, size_t *out_len)
+{
+    uint8_t *encoded = NULL;
+    size_t encoded_len;
+
+    if (encode == 0) {
+        /* Raw message */
+        *out_len = msg_len;
+        return (uint8_t *)msg;
+    }
+    if (ctx_len > ML_DSA_MAX_CONTEXT_STRING_LEN)
+        return NULL;
+
+    /* Pure encoding */
+    encoded_len = 1 + 1 + ctx_len + msg_len;
+    /*
+     * The header and context add a small, bounded amount, so a sum smaller
+     * than msg_len means the addition wrapped. Fail rather than allocate a
+     * buffer that is too small for the copies below.
+     */
+    if (encoded_len < msg_len)
+        return NULL;
+    *out_len = encoded_len;
+    if (encoded_len <= tmp_len) {
+        encoded = tmp;
+    } else {
+        encoded = OPENSSL_zalloc(encoded_len);
+        if (encoded == NULL)
+            return NULL;
+    }
+    encoded[0] = 0;
+    encoded[1] = (uint8_t)ctx_len;
+    /* memcpy() must not be handed a NULL pointer, even for a zero length */
+    if (ctx_len > 0)
+        memcpy(&encoded[2], ctx, ctx_len);
+    if (msg_len > 0)
+        memcpy(&encoded[2 + ctx_len], msg, msg_len);
+    return encoded;
+}
+
+/**
+ * Sign a message. See FIPS 204 Section 5.2 Algorithm 2 ML-DSA.Sign()
+ *
+ * If |sig| is NULL nothing is signed and only the signature size is reported
+ * via |sig_len|. Otherwise |sig_size| must be at least the signature size of
+ * the parameter set held by |ctx|.
+ *
+ * @returns 1 on success, or 0 on error.
+ */
+int ossl_ml_dsa_sign(ML_DSA_CTX *ctx, const ML_DSA_KEY *priv,
+                     const uint8_t *msg, size_t msg_len,
+                     const uint8_t *context, size_t context_len,
+                     const uint8_t *rand, size_t rand_len, int encode,
+                     unsigned char *sig, size_t *sig_len, size_t sig_size)
+{
+    uint8_t stack_buf[1024];
+    uint8_t *encoded;
+    size_t encoded_len = 0;
+    int ok;
+
+    /* Signing needs the private part of the key */
+    if (ossl_ml_dsa_key_get_priv(priv) == NULL)
+        return 0;
+
+    if (sig == NULL) {
+        /* Size query only */
+        if (sig_len != NULL)
+            *sig_len = ctx->params->sig_len;
+        return 1;
+    }
+
+    if (sig_size < ctx->params->sig_len)
+        return 0;
+
+    encoded = msg_encode(msg, msg_len, context, context_len, encode,
+                         stack_buf, sizeof(stack_buf), &encoded_len);
+    if (encoded == NULL)
+        return 0;
+
+    ok = ml_dsa_sign_internal(ctx, priv, encoded, encoded_len, rand, rand_len,
+                              sig);
+
+    /* Only a buffer that msg_encode() allocated needs to be released */
+    if (encoded != stack_buf && encoded != msg)
+        OPENSSL_free(encoded);
+
+    if (sig_len != NULL)
+        *sig_len = ctx->params->sig_len;
+    return ok;
+}
+
+/**
+ * Verify a signature. See FIPS 204 Section 5.3 Algorithm 3 ML-DSA.Verify()
+ *
+ * If |pub| has no public key yet, an attempt is made to obtain one using
+ * ossl_ml_dsa_key_public_from_private() before verifying.
+ *
+ * @returns 1 on success, or 0 on error.
+ */
+int ossl_ml_dsa_verify(ML_DSA_CTX *ctx, ML_DSA_KEY *pub,
+                       const uint8_t *msg, size_t msg_len,
+                       const uint8_t *context, size_t context_len, int encode,
+                       const uint8_t *sig, size_t sig_len)
+{
+    uint8_t stack_buf[1024];
+    uint8_t *encoded;
+    size_t encoded_len;
+    int valid;
+
+    if (ossl_ml_dsa_key_get_pub(pub) == NULL) {
+        if (!ossl_ml_dsa_key_public_from_private(pub, ctx))
+            return 0;
+    }
+
+    encoded = msg_encode(msg, msg_len, context, context_len, encode,
+                         stack_buf, sizeof(stack_buf), &encoded_len);
+    if (encoded == NULL)
+        return 0;
+
+    valid = ml_dsa_verify_internal(ctx, pub, encoded, encoded_len,
+                                   sig, sig_len);
+
+    /* Only a buffer that msg_encode() allocated needs to be released */
+    if (encoded != stack_buf && encoded != msg)
+        OPENSSL_free(encoded);
+    return valid;
+}
--- a/crypto/ml_dsa/ml_dsa_sign.h
+++ b/crypto/ml_dsa/ml_dsa_sign.h
@ -0,0 +1,15 @@
+/*
+ * Copyright 2024 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+/*
+ * The components of a ML-DSA signature.
+ * The storage referenced by the members is supplied by whoever sets the
+ * object up (see signature_init() in ml_dsa_sign.c), it is not owned here.
+ */
+struct ml_dsa_sig_st {
+    VECTOR z;           /* set up with l polynomials by signature_init() */
+    VECTOR hint;        /* set up with k polynomials by signature_init() */
+    uint8_t *c_tilde;   /* the challenge hash bytes */
+    size_t c_tilde_len; /* the size of |c_tilde| in bytes */
+};
--- a/crypto/ml_dsa/ml_dsa_vector.h
+++ b/crypto/ml_dsa/ml_dsa_vector.h
@ -14,7 +14,14 @@ struct vector_st {
    size_t num_poly;
 };

-/* @brief Set the number of polynomial elements that will be present in the vector */
+/**
+ * @brief Initialize a Vector object.
+ *
+ * @param v The vector to initialize.
+ * @param polys Preallocated storage for an array of Polynomials blocks. |v|
+ *              does not own/free this.
+ * @param num_polys The number of |polys| blocks (k or l)
+ */
 static ossl_inline ossl_unused
 void vector_init(VECTOR *v, POLY *polys, size_t num_polys)
 {
@ -29,36 +36,6 @@ void vector_zero(VECTOR *va)
    memset(va->poly, 0, va->num_poly * sizeof(va->poly[0]));
 }

-/* @brief add 2 vectors */
-static ossl_inline ossl_unused void
-vector_add(const VECTOR *lhs, const VECTOR *rhs, VECTOR *out)
-{
-    size_t i;
-
-    for (i = 0; i < lhs->num_poly; i++)
-        poly_add(&lhs->poly[i], &rhs->poly[i], &out->poly[i]);
-}
-
-/* @brief subtract 2 vectors */
-static ossl_inline ossl_unused void
-vector_sub(const VECTOR *lhs, const VECTOR *rhs, VECTOR *out)
-{
-    size_t i;
-
-    for (i = 0; i < lhs->num_poly; i++)
-        poly_sub(&lhs->poly[i], &rhs->poly[i], &out->poly[i]);
-}
-
-/* @brief multiply a vector by a polynomial */
-static ossl_inline ossl_unused void
-vector_ntt_mult_poly(const VECTOR *lhs, const POLY *rhs, VECTOR *out)
-{
-    size_t i;
-
-    for (i = 0; i < lhs->num_poly; i++)
-        ossl_ml_dsa_poly_ntt_mult(&lhs->poly[i], rhs, &out->poly[i]);
-}
-
 /* @brief copy a vector */
 static ossl_inline ossl_unused void
 vector_copy(VECTOR *dst, const VECTOR *src)
@ -82,6 +59,26 @@ vector_equal(const VECTOR *a, const VECTOR *b)
    return 1;
 }

+/* @brief out[i] = lhs[i] + rhs[i] for each polynomial of |lhs| */
+static ossl_inline ossl_unused void
+vector_add(const VECTOR *lhs, const VECTOR *rhs, VECTOR *out)
+{
+    POLY *a = lhs->poly, *b = rhs->poly, *sum = out->poly;
+    size_t left;
+
+    for (left = lhs->num_poly; left > 0; left--, a++, b++, sum++)
+        poly_add(a, b, sum);
+}
+
+/* @brief out[i] = lhs[i] - rhs[i] for each polynomial of |lhs| */
+static ossl_inline ossl_unused void
+vector_sub(const VECTOR *lhs, const VECTOR *rhs, VECTOR *out)
+{
+    POLY *a = lhs->poly, *b = rhs->poly, *diff = out->poly;
+    size_t left;
+
+    for (left = lhs->num_poly; left > 0; left--, a++, b++, diff++)
+        poly_sub(a, b, diff);
+}
+
 /* @brief convert a vector in place into NTT form */
 static ossl_inline ossl_unused void
 vector_ntt(VECTOR *va)
@ -89,7 +86,7 @@ vector_ntt(VECTOR *va)
    size_t i;

    for (i = 0; i < va->num_poly; i++)
-        ossl_ml_dsa_poly_ntt(&va->poly[i]);
+        ossl_ml_dsa_poly_ntt(va->poly + i);
 }

 /* @brief convert a vector in place into inverse NTT form */
@ -99,7 +96,54 @@ vector_ntt_inverse(VECTOR *va)
    size_t i;

    for (i = 0; i < va->num_poly; i++)
-        ossl_ml_dsa_poly_ntt_inverse(&va->poly[i]);
+        ossl_ml_dsa_poly_ntt_inverse(va->poly + i);
+}
+
+/*
+ * @brief Multiply every polynomial of |lhs| by the single polynomial |rhs|
+ * using ossl_ml_dsa_poly_ntt_mult(), writing the products to |out|.
+ */
+static ossl_inline ossl_unused void
+vector_mult_scalar(const VECTOR *lhs, const POLY *rhs, VECTOR *out)
+{
+    POLY *src = lhs->poly, *dst = out->poly;
+    size_t left;
+
+    for (left = lhs->num_poly; left > 0; left--, src++, dst++)
+        ossl_ml_dsa_poly_ntt_mult(src, rhs, dst);
+}
+
+/*
+ * @brief Convenience wrapper that forwards to ossl_ml_dsa_vector_expand_S()
+ * and returns its result unchanged.
+ */
+static ossl_inline ossl_unused int
+vector_expand_S(EVP_MD_CTX *h_ctx, int eta, const uint8_t *seed,
+                VECTOR *s1, VECTOR *s2)
+{
+    int ok = ossl_ml_dsa_vector_expand_S(h_ctx, eta, seed, s1, s2);
+
+    return ok;
+}
+
+/*
+ * @brief Fill every polynomial of |out| using poly_expand_mask().
+ * The seed used for polynomial i is |rho_prime| followed by the 16 bit value
+ * (kappa + i) stored low byte first.
+ *
+ * Exactly ML_DSA_RHO_PRIME_BYTES bytes are read from |rho_prime|;
+ * |rho_prime_len| is not consulted.
+ */
+static ossl_inline ossl_unused void
+vector_expand_mask(VECTOR *out, const uint8_t *rho_prime, size_t rho_prime_len,
+                   uint32_t kappa, uint32_t gamma1, EVP_MD_CTX *h_ctx)
+{
+    uint8_t derived_seed[ML_DSA_RHO_PRIME_BYTES + 2];
+    /* The two counter bytes that follow the copy of rho_prime */
+    uint8_t *counter = derived_seed + ML_DSA_RHO_PRIME_BYTES;
+    size_t n;
+
+    memcpy(derived_seed, rho_prime, ML_DSA_RHO_PRIME_BYTES);
+
+    for (n = 0; n < out->num_poly; n++) {
+        size_t index = kappa + n;
+
+        counter[0] = index & 0xFF;
+        counter[1] = (index >> 8) & 0xFF;
+        poly_expand_mask(out->poly + n, derived_seed, sizeof(derived_seed),
+                         gamma1, h_ctx);
+    }
+}
+
+/*
+ * Scale back a previously rounded vector: poly_scale_power2_round() is applied
+ * to each polynomial of |in|, and the result |out| is then put into NTT form.
+ */
+static ossl_inline ossl_unused void
+vector_scale_power2_round_ntt(const VECTOR *in, VECTOR *out)
+{
+    POLY *src = in->poly, *dst = out->poly;
+    size_t left;
+
+    for (left = in->num_poly; left > 0; left--, src++, dst++)
+        poly_scale_power2_round(src, dst);
+
+    vector_ntt(out);
+}

 /*
@ -113,5 +157,101 @@ vector_power2_round(const VECTOR *t, VECTOR *t1, VECTOR *t0)
    size_t i;

    for (i = 0; i < t->num_poly; i++)
-        poly_power2_round(&t->poly[i], &t1->poly[i], &t0->poly[i]);
+        poly_power2_round(t->poly + i, t1->poly + i, t0->poly + i);
 }
+
+/* @brief Apply poly_high_bits() to |in|, once per polynomial of |out| */
+static ossl_inline ossl_unused void
+vector_high_bits(const VECTOR *in, uint32_t gamma2, VECTOR *out)
+{
+    POLY *src = in->poly, *dst = out->poly;
+    size_t left;
+
+    for (left = out->num_poly; left > 0; left--, src++, dst++)
+        poly_high_bits(src, gamma2, dst);
+}
+
+/* @brief Apply poly_low_bits() to |in|, once per polynomial of |out| */
+static ossl_inline ossl_unused void
+vector_low_bits(const VECTOR *in, uint32_t gamma2, VECTOR *out)
+{
+    POLY *src = in->poly, *dst = out->poly;
+    size_t left;
+
+    for (left = out->num_poly; left > 0; left--, src++, dst++)
+        poly_low_bits(src, gamma2, dst);
+}
+
+/*
+ * @brief Run poly_max() over every polynomial of |v| with one shared
+ * accumulator that starts at 0, and return the accumulator.
+ */
+static ossl_inline ossl_unused uint32_t
+vector_max(const VECTOR *v)
+{
+    uint32_t largest = 0;
+    POLY *cur = v->poly;
+    size_t left;
+
+    for (left = v->num_poly; left > 0; left--, cur++)
+        poly_max(cur, &largest);
+    return largest;
+}
+
+/*
+ * @brief Run poly_max_signed() over every polynomial of |v| with one shared
+ * accumulator that starts at 0, and return the accumulator.
+ */
+static ossl_inline ossl_unused uint32_t
+vector_max_signed(const VECTOR *v)
+{
+    uint32_t largest = 0;
+    POLY *cur = v->poly;
+    size_t left;
+
+    for (left = v->num_poly; left > 0; left--, cur++)
+        poly_max_signed(cur, &largest);
+    return largest;
+}
+
+/*
+ * @brief Return the sum of all coefficients of all polynomials in |v|.
+ * This is the number of ones when every coefficient is either 0 or 1.
+ */
+static ossl_inline ossl_unused size_t
+vector_count_ones(const VECTOR *v)
+{
+    size_t total = 0, left;
+    POLY *cur = v->poly;
+    int j;
+
+    for (left = v->num_poly; left > 0; left--, cur++) {
+        for (j = 0; j < ML_DSA_NUM_POLY_COEFFICIENTS; j++)
+            total += cur->coeff[j];
+    }
+    return total;
+}
+
+/*
+ * @brief Apply poly_make_hint() to matching polynomials of |ct0|, |cs2| and
+ * |w|, once per polynomial of |out|.
+ */
+static ossl_inline ossl_unused void
+vector_make_hint(const VECTOR *ct0, const VECTOR *cs2, const VECTOR *w,
+                 uint32_t gamma2, VECTOR *out)
+{
+    POLY *a = ct0->poly, *b = cs2->poly, *c = w->poly, *hint = out->poly;
+    size_t left;
+
+    for (left = out->num_poly; left > 0; left--, a++, b++, c++, hint++)
+        poly_make_hint(a, b, c, gamma2, hint);
+}
+
+/*
+ * @brief Apply poly_use_hint() to matching polynomials of |h| and |r|, once
+ * per polynomial of |out|.
+ */
+static ossl_inline ossl_unused void
+vector_use_hint(const VECTOR *h, const VECTOR *r, uint32_t gamma2, VECTOR *out)
+{
+    POLY *hint = h->poly, *src = r->poly, *dst = out->poly;
+    size_t left;
+
+    for (left = out->num_poly; left > 0; left--, hint++, src++, dst++)
+        poly_use_hint(hint, src, gamma2, dst);
+}
+
+#if defined(ML_DSA_DEBUG)
+/* Debug only: print a heading then each polynomial using poly_print() */
+static ossl_inline ossl_unused void
+vector_print(const char * name, const VECTOR *v)
+{
+    POLY *cur;
+    size_t left;
+
+    printf("\nVECTOR %s:\n", name);
+
+    for (cur = v->poly, left = v->num_poly; left > 0; left--, cur++)
+        poly_print(cur);
+}
+
+/* Debug only: print a heading then each polynomial using poly_print_signed() */
+static ossl_inline ossl_unused void
+vector_print_signed(const char * name, const VECTOR *v)
+{
+    POLY *cur;
+    size_t left;
+
+    printf("\nVECTOR %s:\n", name);
+
+    for (cur = v->poly, left = v->num_poly; left > 0; left--, cur++)
+        poly_print_signed(cur);
+}
+#endif
--- a/include/crypto/ml_dsa.h
+++ b/include/crypto/ml_dsa.h
@ -28,6 +28,7 @@ __owur int ossl_ml_dsa_key_equal(const ML_DSA_KEY *key1, const ML_DSA_KEY *key2,
                                 int selection);
 __owur int ossl_ml_dsa_key_has(const ML_DSA_KEY *key, int selection);
 __owur int ossl_ml_dsa_key_pairwise_check(const ML_DSA_KEY *key);
+__owur int ossl_ml_dsa_key_public_from_private(ML_DSA_KEY *key, ML_DSA_CTX *ctx);
 __owur int ossl_ml_dsa_key_fromdata(ML_DSA_KEY *key, const OSSL_PARAM *params,
                                    int include_private);
 __owur int ossl_ml_dsa_generate_key(ML_DSA_CTX *ctx, OSSL_LIB_CTX *libctx,
@ -52,4 +53,14 @@ __owur ML_DSA_CTX *ossl_ml_dsa_ctx_new(const char *alg,
                                       OSSL_LIB_CTX *lib_ctx, const char *propq);
 void ossl_ml_dsa_ctx_free(ML_DSA_CTX *ctx);

+__owur int ossl_ml_dsa_sign(ML_DSA_CTX *ctx, const ML_DSA_KEY *priv,
+                            const uint8_t *msg, size_t msg_len,
+                            const uint8_t *context, size_t context_len,
+                            const uint8_t *rand, size_t rand_len, int encode,
+                            unsigned char *sig, size_t *siglen, size_t sigsize);
+__owur int ossl_ml_dsa_verify(ML_DSA_CTX *ctx, ML_DSA_KEY *pub,
+                              const uint8_t *msg, size_t msg_len,
+                              const uint8_t *context, size_t context_len,
+                              int encode, const uint8_t *sig, size_t sig_len);
+
 #endif /* OSSL_CRYPTO_SLH_DSA_H */
--- a/providers/defltprov.c
+++ b/providers/defltprov.c
@ -446,6 +446,9 @@ static const OSSL_ALGORITHM deflt_signature[] = {
 # ifndef OPENSSL_NO_SM2
    { PROV_NAMES_SM2, "provider=default", ossl_sm2_signature_functions },
 # endif
+#endif
+#ifndef OPENSSL_NO_ML_DSA
+    { PROV_NAMES_ML_DSA_65, "provider=default", ossl_ml_dsa_65_signature_functions },
 #endif
    { PROV_NAMES_HMAC, "provider=default", ossl_mac_legacy_hmac_signature_functions },
    { PROV_NAMES_SIPHASH, "provider=default",
--- a/providers/implementations/include/prov/implementations.h
+++ b/providers/implementations/include/prov/implementations.h
@ -384,6 +384,7 @@ extern const OSSL_DISPATCH ossl_mac_legacy_siphash_signature_functions[];
 extern const OSSL_DISPATCH ossl_mac_legacy_poly1305_signature_functions[];
 extern const OSSL_DISPATCH ossl_mac_legacy_cmac_signature_functions[];
 extern const OSSL_DISPATCH ossl_sm2_signature_functions[];
+extern const OSSL_DISPATCH ossl_ml_dsa_65_signature_functions[];

 /* Asym Cipher */
 extern const OSSL_DISPATCH ossl_rsa_asym_cipher_functions[];
--- a/providers/implementations/signature/build.info
+++ b/providers/implementations/signature/build.info
@ -6,6 +6,7 @@ $EC_GOAL=../../libdefault.a ../../libfips.a
 $MAC_GOAL=../../libdefault.a ../../libfips.a
 $RSA_GOAL=../../libdefault.a ../../libfips.a
 $SM2_GOAL=../../libdefault.a
+$ML_DSA_GOAL=../../libdefault.a

 IF[{- !$disabled{dsa} -}]
  SOURCE[$DSA_GOAL]=dsa_sig.c
@ -31,3 +32,7 @@ DEPEND[eddsa_sig.o]=../../common/include/prov/der_ecx.h
 DEPEND[sm2_sig.o]=../../common/include/prov/der_sm2.h

 SOURCE[$MAC_GOAL]=mac_legacy_sig.c
+
+IF[{- !$disabled{ml-dsa} -}]
+  SOURCE[$ML_DSA_GOAL]=ml_dsa_sig.c
+ENDIF
--- a/providers/implementations/signature/ml_dsa_sig.c
+++ b/providers/implementations/signature/ml_dsa_sig.c
@ -0,0 +1,247 @@
+/*
+ * Copyright 2024 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#include "internal/deprecated.h"
+
+#include <assert.h>
+#include <string.h> /* memset */
+#include <openssl/core_names.h>
+#include <openssl/err.h>
+#include <openssl/rand.h>
+#include <openssl/proverr.h>
+#include "prov/implementations.h"
+#include "prov/providercommon.h"
+#include "prov/provider_ctx.h"
+#include "crypto/ml_dsa.h"
+
+#define ML_DSA_ENTROPY_LEN 32
+
+#define ML_DSA_MESSAGE_ENCODE_RAW  0
+#define ML_DSA_MESSAGE_ENCODE_PURE 1
+
+static OSSL_FUNC_signature_sign_message_init_fn ml_dsa_sign_msg_init;
+static OSSL_FUNC_signature_sign_fn ml_dsa_sign;
+static OSSL_FUNC_signature_verify_message_init_fn ml_dsa_verify_msg_init;
+static OSSL_FUNC_signature_verify_fn ml_dsa_verify;
+static OSSL_FUNC_signature_freectx_fn ml_dsa_freectx;
+static OSSL_FUNC_signature_set_ctx_params_fn ml_dsa_set_ctx_params;
+static OSSL_FUNC_signature_settable_ctx_params_fn ml_dsa_settable_ctx_params;
+
+typedef struct { /* provider-side state for one ML-DSA sign/verify context */
+    ML_DSA_KEY *key; /* reference taken in ml_dsa_signverify_msg_init() */
+    ML_DSA_CTX *ctx; /* created by ossl_ml_dsa_ctx_new() in ml_dsa_newctx() */
+    uint8_t context_string[ML_DSA_MAX_CONTEXT_STRING_LEN]; /* OSSL_SIGNATURE_PARAM_CONTEXT_STRING */
+    size_t context_string_len; /* bytes of context_string in use; 0 if unset */
+    uint8_t test_entropy[ML_DSA_ENTROPY_LEN]; /* OSSL_SIGNATURE_PARAM_TEST_ENTROPY */
+    size_t test_entropy_len; /* 0 (unset) or ML_DSA_ENTROPY_LEN */
+    int msg_encode; /* newctx default: ML_DSA_MESSAGE_ENCODE_PURE; settable */
+    int deterministic; /* 1: sign with all-zero randomness if no test entropy */
+    OSSL_LIB_CTX *libctx; /* PROV_LIBCTX_OF(provctx) */
+    char *propq; /* OPENSSL_strdup() copy of the property query, or NULL */
+} PROV_ML_DSA_CTX;
+
+/*
+ * Release a context created by ml_dsa_newctx(): the property query copy,
+ * the ML_DSA_CTX, the key reference and the context itself.
+ * Passing NULL is a no-op.
+ */
+static void ml_dsa_freectx(void *vctx)
+{
+    PROV_ML_DSA_CTX *ctx = (PROV_ML_DSA_CTX *)vctx;
+
+    if (ctx == NULL)
+        return;
+
+    OPENSSL_free(ctx->propq);
+    ossl_ml_dsa_ctx_free(ctx->ctx);
+    ossl_ml_dsa_key_free(ctx->key);
+    /*
+     * Wipe the whole buffer, not just test_entropy_len bytes: a rejected
+     * OSSL_SIGNATURE_PARAM_TEST_ENTROPY resets the length to 0 after bytes
+     * may already have been copied into the buffer.
+     */
+    OPENSSL_cleanse(ctx->test_entropy, sizeof(ctx->test_entropy));
+    OPENSSL_free(ctx);
+}
+
+/*
+ * Allocate a zeroed signature context for the algorithm named by alg.
+ *
+ * The library context comes from provctx, propq (if any) is duplicated,
+ * and the message encoding defaults to ML_DSA_MESSAGE_ENCODE_PURE.
+ *
+ * Returns the new context, or NULL if the provider is not running or any
+ * allocation fails.
+ */
+static void *ml_dsa_newctx(void *provctx, const char *alg, const char *propq)
+{
+    PROV_ML_DSA_CTX *pctx = NULL;
+
+    if (!ossl_prov_is_running())
+        return NULL;
+
+    pctx = OPENSSL_zalloc(sizeof(*pctx));
+    if (pctx == NULL)
+        return NULL;
+
+    pctx->msg_encode = ML_DSA_MESSAGE_ENCODE_PURE;
+    pctx->libctx = PROV_LIBCTX_OF(provctx);
+
+    if (propq != NULL) {
+        pctx->propq = OPENSSL_strdup(propq);
+        if (pctx->propq == NULL)
+            goto err;
+    }
+
+    pctx->ctx = ossl_ml_dsa_ctx_new(alg, pctx->libctx, pctx->propq);
+    if (pctx->ctx != NULL)
+        return pctx;
+
+ err:
+    ml_dsa_freectx(pctx);
+    return NULL;
+}
+
+/*
+ * Shared body of the sign/verify message-init entry points.
+ *
+ * vkey may be NULL to keep a key installed by an earlier init.  A non-NULL
+ * vkey must pass ossl_ml_dsa_key_type_matches() against the context; a new
+ * reference is then taken and any previously held key is released.
+ * params are applied last through ml_dsa_set_ctx_params().
+ * operation and desc are accepted but not used here.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+static int ml_dsa_signverify_msg_init(void *vctx, void *vkey,
+                                      const OSSL_PARAM params[], int operation,
+                                      const char *desc)
+{
+    PROV_ML_DSA_CTX *pctx = (PROV_ML_DSA_CTX *)vctx;
+    ML_DSA_KEY *new_key = vkey;
+
+    if (!ossl_prov_is_running())
+        return 0;
+    if (pctx == NULL)
+        return 0;
+
+    if (new_key == NULL) {
+        /* No key supplied: one must already be present from a prior init */
+        if (pctx->key == NULL) {
+            ERR_raise(ERR_LIB_PROV, PROV_R_NO_KEY_SET);
+            return 0;
+        }
+    } else {
+        if (!ossl_ml_dsa_key_type_matches(pctx->ctx, new_key)
+                || !ossl_ml_dsa_key_up_ref(new_key))
+            return 0;
+        ossl_ml_dsa_key_free(pctx->key);
+        pctx->key = new_key;
+    }
+
+    return ml_dsa_set_ctx_params(pctx, params) ? 1 : 0;
+}
+
+/* Sign-side message init: forwards to the shared init with EVP_PKEY_OP_SIGN. */
+static int ml_dsa_sign_msg_init(void *vctx, void *vkey, const OSSL_PARAM params[])
+{
+    static const char desc[] = "ML_DSA Sign Init";
+
+    return ml_dsa_signverify_msg_init(vctx, vkey, params,
+                                      EVP_PKEY_OP_SIGN, desc);
+}
+
+/*
+ * Sign msg with the key held in the context.
+ *
+ * When sig is non-NULL the per-signature randomness is chosen in this order:
+ * caller-supplied test entropy, all zeros if deterministic == 1, otherwise
+ * fresh bytes from RAND_priv_bytes_ex().  When sig is NULL no randomness is
+ * produced and a NULL pointer is handed to ossl_ml_dsa_sign() (presumably a
+ * length query - confirm against ossl_ml_dsa_sign()).
+ *
+ * Returns the result of ossl_ml_dsa_sign(), or 0 if the provider is not
+ * running or random generation fails.
+ */
+static int ml_dsa_sign(void *vctx, unsigned char *sig, size_t *siglen,
+                       size_t sigsize, const unsigned char *msg, size_t msg_len)
+{
+    PROV_ML_DSA_CTX *pctx = (PROV_ML_DSA_CTX *)vctx;
+    uint8_t local_rnd[ML_DSA_ENTROPY_LEN];
+    uint8_t *entropy = NULL;
+    int ok;
+
+    if (!ossl_prov_is_running())
+        return 0;
+
+    if (sig != NULL) {
+        if (pctx->test_entropy_len != 0) {
+            entropy = pctx->test_entropy;
+        } else if (pctx->deterministic == 1) {
+            entropy = local_rnd;
+            memset(entropy, 0, sizeof(local_rnd));
+        } else {
+            entropy = local_rnd;
+            if (RAND_priv_bytes_ex(pctx->libctx, entropy,
+                                   sizeof(local_rnd), 0) <= 0)
+                return 0;
+        }
+    }
+
+    ok = ossl_ml_dsa_sign(pctx->ctx, pctx->key, msg, msg_len,
+                          pctx->context_string, pctx->context_string_len,
+                          entropy, sizeof(local_rnd), pctx->msg_encode,
+                          sig, siglen, sigsize);
+
+    /* The stack buffer is wiped unless the test entropy was the source */
+    if (entropy != pctx->test_entropy)
+        OPENSSL_cleanse(local_rnd, sizeof(local_rnd));
+    return ok;
+}
+
+/* Verify-side message init: forwards to the shared init with EVP_PKEY_OP_VERIFY. */
+static int ml_dsa_verify_msg_init(void *vctx, void *vkey, const OSSL_PARAM params[])
+{
+    static const char desc[] = "ML_DSA Verify Init";
+
+    return ml_dsa_signverify_msg_init(vctx, vkey, params,
+                                      EVP_PKEY_OP_VERIFY, desc);
+}
+
+/*
+ * Verify sig over msg using the key, context string and message encoding
+ * stored in the context.
+ *
+ * Returns the result of ossl_ml_dsa_verify(), or 0 if the provider is not
+ * running.
+ */
+static int ml_dsa_verify(void *vctx, const unsigned char *sig, size_t siglen,
+                         const unsigned char *msg, size_t msg_len)
+{
+    PROV_ML_DSA_CTX *pctx = (PROV_ML_DSA_CTX *)vctx;
+    int ok = 0;
+
+    if (ossl_prov_is_running())
+        ok = ossl_ml_dsa_verify(pctx->ctx, pctx->key, msg, msg_len,
+                                pctx->context_string,
+                                pctx->context_string_len,
+                                pctx->msg_encode, sig, siglen);
+    return ok;
+}
+
+/*
+ * Apply caller-supplied parameters to the signature context.
+ *
+ * Recognised keys: context string, test entropy (must be exactly
+ * sizeof(test_entropy) bytes), deterministic flag and message encoding.
+ * A rejected context string or test entropy resets its stored length to 0.
+ *
+ * Returns 1 on success (including an empty params list), 0 if vctx is NULL
+ * or any parameter cannot be read.
+ */
+static int ml_dsa_set_ctx_params(void *vctx, const OSSL_PARAM params[])
+{
+    PROV_ML_DSA_CTX *pctx = (PROV_ML_DSA_CTX *)vctx;
+    const OSSL_PARAM *prm;
+    void *dst;
+
+    if (pctx == NULL)
+        return 0;
+    if (ossl_param_is_empty(params))
+        return 1;
+
+    prm = OSSL_PARAM_locate_const(params, OSSL_SIGNATURE_PARAM_CONTEXT_STRING);
+    if (prm != NULL) {
+        dst = pctx->context_string;
+        if (!OSSL_PARAM_get_octet_string(prm, &dst,
+                                         sizeof(pctx->context_string),
+                                         &pctx->context_string_len)) {
+            pctx->context_string_len = 0;
+            return 0;
+        }
+    }
+
+    prm = OSSL_PARAM_locate_const(params, OSSL_SIGNATURE_PARAM_TEST_ENTROPY);
+    if (prm != NULL) {
+        int got;
+
+        dst = pctx->test_entropy;
+        got = OSSL_PARAM_get_octet_string(prm, &dst,
+                                          sizeof(pctx->test_entropy),
+                                          &pctx->test_entropy_len);
+        /* Only a full-length entropy value is accepted */
+        if (!got || pctx->test_entropy_len != sizeof(pctx->test_entropy)) {
+            pctx->test_entropy_len = 0;
+            return 0;
+        }
+    }
+
+    prm = OSSL_PARAM_locate_const(params, OSSL_SIGNATURE_PARAM_DETERMINISTIC);
+    if (prm != NULL) {
+        if (!OSSL_PARAM_get_int(prm, &pctx->deterministic))
+            return 0;
+    }
+
+    prm = OSSL_PARAM_locate_const(params, OSSL_SIGNATURE_PARAM_MESSAGE_ENCODING);
+    if (prm != NULL) {
+        if (!OSSL_PARAM_get_int(prm, &pctx->msg_encode))
+            return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * Describe the parameters accepted by ml_dsa_set_ctx_params().
+ * Both arguments are ignored; the same static table is always returned.
+ */
+static const OSSL_PARAM *ml_dsa_settable_ctx_params(void *vctx,
+                                                    ossl_unused void *provctx)
+{
+    static const OSSL_PARAM ml_dsa_settable_table[] = {
+        OSSL_PARAM_octet_string(OSSL_SIGNATURE_PARAM_CONTEXT_STRING, NULL, 0),
+        OSSL_PARAM_octet_string(OSSL_SIGNATURE_PARAM_TEST_ENTROPY, NULL, 0),
+        OSSL_PARAM_int(OSSL_SIGNATURE_PARAM_DETERMINISTIC, 0),
+        OSSL_PARAM_int(OSSL_SIGNATURE_PARAM_MESSAGE_ENCODING, 0),
+        OSSL_PARAM_END
+    };
+
+    return ml_dsa_settable_table;
+}
+
+/*
+ * For one ML-DSA parameter set, emit:
+ *  - ml_dsa_<suffix>_newctx(), a newctx wrapper that binds alg_name, and
+ *  - ossl_ml_dsa_<suffix>_signature_functions[], the dispatch table that
+ *    wires the wrapper together with the shared handlers in this file.
+ */
+#define MAKE_SIGNATURE_FUNCTIONS(alg_name, suffix)                             \
+    static OSSL_FUNC_signature_newctx_fn ml_dsa_##suffix##_newctx;             \
+    static void *ml_dsa_##suffix##_newctx(void *provctx, const char *propq)    \
+    {                                                                          \
+        return ml_dsa_newctx(provctx, alg_name, propq);                        \
+    }                                                                          \
+    const OSSL_DISPATCH ossl_ml_dsa_##suffix##_signature_functions[] = {       \
+        { OSSL_FUNC_SIGNATURE_NEWCTX,                                          \
+          (void (*)(void))ml_dsa_##suffix##_newctx },                          \
+        { OSSL_FUNC_SIGNATURE_SIGN_MESSAGE_INIT,                               \
+          (void (*)(void))ml_dsa_sign_msg_init },                              \
+        { OSSL_FUNC_SIGNATURE_SIGN,                                            \
+          (void (*)(void))ml_dsa_sign },                                       \
+        { OSSL_FUNC_SIGNATURE_VERIFY_MESSAGE_INIT,                             \
+          (void (*)(void))ml_dsa_verify_msg_init },                            \
+        { OSSL_FUNC_SIGNATURE_VERIFY,                                          \
+          (void (*)(void))ml_dsa_verify },                                     \
+        { OSSL_FUNC_SIGNATURE_FREECTX,                                         \
+          (void (*)(void))ml_dsa_freectx },                                    \
+        { OSSL_FUNC_SIGNATURE_SET_CTX_PARAMS,                                  \
+          (void (*)(void))ml_dsa_set_ctx_params },                             \
+        { OSSL_FUNC_SIGNATURE_SETTABLE_CTX_PARAMS,                             \
+          (void (*)(void))ml_dsa_settable_ctx_params },                        \
+        OSSL_DISPATCH_END                                                      \
+    }
+
+/* Defines ossl_ml_dsa_65_signature_functions */
+MAKE_SIGNATURE_FUNCTIONS("ML-DSA-65", 65);
--- a/test/ml_dsa.inc
+++ b/test/ml_dsa.inc
--- a/test/ml_dsa_test.c
+++ b/test/ml_dsa_test.c
@ -46,6 +46,38 @@ err:
    return pkey;
 }

+/*
+ * Build an EVP_PKEY for algorithm name from raw key material via
+ * EVP_PKEY_fromdata().  Either of priv/pub may be NULL; the selection passed
+ * to EVP_PKEY_fromdata() covers only the parts that were supplied.
+ *
+ * Returns 1 on success with the key stored through pkey, 0 otherwise.
+ */
+static int ml_dsa_create_keypair(EVP_PKEY **pkey, const char *name,
+                                 const uint8_t *priv, size_t priv_len,
+                                 const uint8_t *pub, size_t pub_len)
+{
+    EVP_PKEY_CTX *ctx = NULL;
+    OSSL_PARAM params[3]; /* private key, public key, terminator */
+    size_t n = 0;
+    int selection = 0;
+    int ret = 0;
+
+    if (priv != NULL) {
+        params[n++] = OSSL_PARAM_construct_octet_string(OSSL_PKEY_PARAM_PRIV_KEY,
+                                                        (uint8_t *)priv,
+                                                        priv_len);
+        selection |= OSSL_KEYMGMT_SELECT_PRIVATE_KEY;
+    }
+    if (pub != NULL) {
+        params[n++] = OSSL_PARAM_construct_octet_string(OSSL_PKEY_PARAM_PUB_KEY,
+                                                        (uint8_t *)pub,
+                                                        pub_len);
+        selection |= OSSL_KEYMGMT_SELECT_PUBLIC_KEY;
+    }
+    params[n] = OSSL_PARAM_construct_end();
+
+    if (!TEST_ptr(ctx = EVP_PKEY_CTX_new_from_name(lib_ctx, name, NULL))
+            || !TEST_int_eq(EVP_PKEY_fromdata_init(ctx), 1)
+            || !TEST_int_eq(EVP_PKEY_fromdata(ctx, pkey, selection,
+                                              params), 1))
+        goto err;
+
+    ret = 1;
+err:
+    EVP_PKEY_CTX_free(ctx);
+    return ret;
+}
+
 static int ml_dsa_keygen_test(int tst_id)
 {
    int ret = 0;
@ -72,6 +104,69 @@ err:
    return ret;
 }

+/*
+ * Signature generation known-answer test for entry tst_id of
+ * ml_dsa_siggen_testdata.
+ *
+ * Loads the private key, signs the message with the deterministic flag set,
+ * message encoding 0 and (when the vector has one) fixed test entropy,
+ * checks that the reported signature length equals the key's
+ * OSSL_PKEY_PARAM_MAX_SIZE, compares the SHA256 digest of the signature
+ * with the expected digest, then verifies the signature with the same
+ * context.
+ *
+ * Returns 1 on success, 0 on failure.
+ */
+static int ml_dsa_siggen_test(int tst_id)
+{
+    int ret = 0;
+    ML_DSA_SIG_TEST_DATA *td = &ml_dsa_siggen_testdata[tst_id];
+    EVP_PKEY_CTX *sctx = NULL;
+    EVP_PKEY *pkey = NULL;
+    EVP_SIGNATURE *sig_alg = NULL;
+    OSSL_PARAM params[4], *p = params; /* up to 3 settings + terminator */
+    uint8_t *psig = NULL;
+    size_t psig_len = 0, sig_len2 = 0;
+    uint8_t digest[32];
+    size_t digest_len = sizeof(digest);
+    int encode = 0, deterministic = 1;
+
+    *p++ = OSSL_PARAM_construct_int(OSSL_SIGNATURE_PARAM_DETERMINISTIC, &deterministic);
+    *p++ = OSSL_PARAM_construct_int(OSSL_SIGNATURE_PARAM_MESSAGE_ENCODING, &encode);
+    if (td->add_random != NULL)
+        *p++ = OSSL_PARAM_construct_octet_string(OSSL_SIGNATURE_PARAM_TEST_ENTROPY,
+                                                 (char *)td->add_random,
+                                                 td->add_random_len);
+    *p = OSSL_PARAM_construct_end();
+
+    /*
+     * This just uses from data here, but keygen also works.
+     * The keygen path is tested via ml_dsa_keygen_test
+     */
+    if (!ml_dsa_create_keypair(&pkey, td->alg, td->priv, td->priv_len,
+                               NULL, 0))
+        goto err;
+
+    if (!TEST_ptr(sctx = EVP_PKEY_CTX_new_from_pkey(lib_ctx, pkey, NULL)))
+        goto err;
+    if (!TEST_ptr(sig_alg = EVP_SIGNATURE_fetch(lib_ctx, td->alg, NULL)))
+        goto err;
+    /* First EVP_PKEY_sign() call (NULL output) only queries the length */
+    if (!TEST_int_eq(EVP_PKEY_sign_message_init(sctx, sig_alg, params), 1)
+            || !TEST_int_eq(EVP_PKEY_sign(sctx, NULL, &psig_len,
+                                          td->msg, td->msg_len), 1)
+            || !TEST_true(EVP_PKEY_get_size_t_param(pkey, OSSL_PKEY_PARAM_MAX_SIZE,
+                                                    &sig_len2))
+            /* Both operands are size_t: compare without truncating to int */
+            || !TEST_size_t_eq(sig_len2, psig_len)
+            || !TEST_ptr(psig = OPENSSL_zalloc(psig_len))
+            || !TEST_int_eq(EVP_PKEY_sign(sctx, psig, &psig_len,
+                                          td->msg, td->msg_len), 1))
+        goto err;
+    /* The vectors store a digest of the expected signature */
+    if (!TEST_int_eq(EVP_Q_digest(lib_ctx, "SHA256", NULL, psig, psig_len,
+                                  digest, &digest_len), 1))
+        goto err;
+    if (!TEST_mem_eq(digest, digest_len, td->sig_digest, td->sig_digest_len))
+        goto err;
+
+    if (!TEST_int_eq(EVP_PKEY_verify_message_init(sctx, sig_alg, params), 1)
+            || !TEST_int_eq(EVP_PKEY_verify(sctx, psig, psig_len,
+                                            td->msg, td->msg_len), 1))
+        goto err;
+    ret = 1;
+err:
+    EVP_SIGNATURE_free(sig_alg);
+    EVP_PKEY_free(pkey);
+    EVP_PKEY_CTX_free(sctx);
+    OPENSSL_free(psig);
+    return ret;
+}
+
 const OPTIONS *test_get_options(void)
 {
    static const OPTIONS options[] = {
@ -104,6 +199,7 @@ int setup_tests(void)
        return 0;

    ADD_ALL_TESTS(ml_dsa_keygen_test, OSSL_NELEM(ml_dsa_keygen_testdata));
+    ADD_ALL_TESTS(ml_dsa_siggen_test, OSSL_NELEM(ml_dsa_siggen_testdata));
    return 1;
 }

--- a/util/perl/OpenSSL/paramnames.pm
+++ b/util/perl/OpenSSL/paramnames.pm
@ -469,6 +469,9 @@ my %params = (
    'SIGNATURE_PARAM_FIPS_SIGN_X931_PAD_CHECK' => "sign-x931-pad-check",
    'SIGNATURE_PARAM_FIPS_APPROVED_INDICATOR' => '*ALG_PARAM_FIPS_APPROVED_INDICATOR',
    'SIGNATURE_PARAM_SIGNATURE' =>          "signature",
+    'SIGNATURE_PARAM_MESSAGE_ENCODING' =>   "message-encoding",
+    'SIGNATURE_PARAM_DETERMINISTIC' =>      "deterministic",
+    'SIGNATURE_PARAM_TEST_ENTROPY' =>       "test-entropy",

 # Asym cipher parameters
    'ASYM_CIPHER_PARAM_DIGEST' =>                   '*PKEY_PARAM_DIGEST',