
Use AVX2 for polynomial math (contributed by Shay Gueron and Fabian Schlieker)

From https://github.com/fschlieker/libntru
Committed by Tim Buktu 3 years ago
commit 0adbe29c9c

3 changed files with 343 additions and 8 deletions:
  1. LICENSE (+1 -0)
  2. src/poly.c (+308 -4)
  3. src/poly.h (+34 -4)

LICENSE (+1 -0)

@@ -1,4 +1,5 @@
 Copyright (c) 2012, Tim Buktu
+Copyright (c) 2016, Shay Gueron and Fabian Schlieker
 Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
 All rights reserved.
 

src/poly.c (+308 -4)

@@ -3,6 +3,9 @@
 #ifdef __SSSE3__
 #include <tmmintrin.h>
 #endif
+#ifdef __AVX2__
+#include <immintrin.h>
+#endif
 #include "poly.h"
 #include "rand.h"
 #include "err.h"
@@ -114,7 +117,9 @@ void ntru_neg_mod(NtruIntPoly *a, uint16_t modulus) {
 }
 
 uint8_t ntru_mult_int(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
-#ifdef __SSSE3__
+#ifdef __AVX2__
+    return ntru_mult_int_avx2(a, b, c, mod_mask);
+#elif __SSSE3__
     return ntru_mult_int_sse(a, b, c, mod_mask);
 #elif _LP64
     return ntru_mult_int_64(a, b, c, mod_mask);
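
Note: the `__AVX2__` macro in the dispatch above is predefined by GCC and Clang only when the compiler is allowed to emit AVX2 instructions, e.g. with `gcc -mavx2 -O2 -c src/poly.c` or `-march=native` on an AVX2-capable CPU; otherwise the SSSE3 or portable fallbacks are compiled in. (The flags here are illustrative; libntru's own build system may already set them.)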
@@ -391,8 +396,79 @@ uint8_t ntru_mult_int_sse(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16
 }
 #endif   /* __SSSE3__ */
 
+#ifdef __AVX2__
+uint8_t ntru_mult_int_avx2(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
+    uint16_t N = a->N;
+    if (N != b->N)
+        return 0;
+    c->N = N;
+    int16_t c_coeffs[2*NTRU_INT_POLY_SIZE];   /* double capacity for intermediate result */
+    memset(&c_coeffs, 0, sizeof(c_coeffs));
+
+    uint16_t k;
+    for (k=N; k<NTRU_INT_POLY_SIZE; k++) {
+        a->coeffs[k] = 0;
+        b->coeffs[k] = 0;
+    }
+    for (k=0; k<N; k+=16) {
+        uint8_t j;
+
+        __m256i b256[8];
+        for (j=0; j<8; j++) {
+            b256[j] = _mm256_inserti128_si256(_mm256_castsi128_si256(
+                    _mm_set1_epi16(b->coeffs[k+j])),
+                    _mm_set1_epi16(b->coeffs[k+8+j]), 1);
+        }
+
+        /* indices 0..7 */
+        __m128i tmp_a = _mm_lddqu_si128((__m128i*)&a->coeffs[0]);
+        __m256i a256 = _mm256_broadcastsi128_si256(tmp_a);
+
+        __m256i c256 = _mm256_lddqu_si256((__m256i*)&c_coeffs[k]);
+        for (j=0; j<8; j++) {
+            __m256i product = _mm256_mullo_epi16(a256, b256[j]);
+            c256 = _mm256_add_epi16(c256, product);
+            a256 = _mm256_bslli_epi128(a256, 2);
+        }
+        _mm256_storeu_si256((__m256i*)&c_coeffs[k], c256);
+
+        /* indices 8... */
+        uint16_t i;
+        for (i=8; i<N+8; i+=8) {
+            __m256i c256 = _mm256_lddqu_si256((__m256i*)&c_coeffs[k+i]);
+
+            __m128i tmp_0 = _mm_lddqu_si128((__m128i*)&a->coeffs[i-7]);
+            __m256i a256_0 = _mm256_broadcastsi128_si256(tmp_0);
+
+            __m128i tmp_1 = _mm_lddqu_si128((__m128i*)&a->coeffs[i]);
+            __m256i a256_1 = _mm256_broadcastsi128_si256(tmp_1);
+
+            for (j=0; j<8; j++) {
+                __m256i product = _mm256_mullo_epi16(a256_1, b256[j]);
+                c256 = _mm256_add_epi16(c256, product);
+
+                a256_0 = _mm256_bslli_epi128(a256_0, 2);
+                a256_1 = _mm256_alignr_epi8(a256_1, a256_0, 14);
+            }
+            _mm256_storeu_si256((__m256i*)&c_coeffs[k+i], c256);
+        }
+    }
+
+    /* no need to SSE-ify the following loop b/c the compiler auto-vectorizes it */
+    for (k=0; k<N; k++)
+        c->coeffs[k] = c_coeffs[k] + c_coeffs[N+k];
+
+    ntru_mod_mask(c, mod_mask);
+    return 1;
+}
+#endif   /* __AVX2__ */
+
 uint8_t ntru_mult_tern(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
-#ifdef __SSSE3__
+#ifdef __AVX2__
+    return ntru_mult_tern_avx2(a, b, c, mod_mask);
+#elif __SSSE3__
     return ntru_mult_tern_sse(a, b, c, mod_mask);
 #elif _LP64
     return ntru_mult_tern_64(a, b, c, mod_mask);
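
For reference (not part of the commit), the scalar computation that `ntru_mult_int_avx2` vectorizes is schoolbook multiplication into a double-length buffer, followed by folding the upper half back onto the lower half (x^N = 1 in the ring) and masking modulo q. A minimal sketch, assuming `N <= NTRU_INT_POLY_SIZE` and a power-of-two modulus; the function name is illustrative:

    #include <stdint.h>
    #include <string.h>

    /* Scalar reference: c = a*b mod (x^N - 1), coefficients masked with mod_mask = q-1. */
    static void mult_int_ref(const int16_t *a, const int16_t *b, int16_t *c,
                             uint16_t N, uint16_t mod_mask) {
        int16_t tmp[2*N];                     /* double capacity, like c_coeffs above */
        memset(tmp, 0, sizeof(tmp));
        uint16_t i, j;
        for (i=0; i<N; i++)                   /* schoolbook multiplication */
            for (j=0; j<N; j++)
                tmp[i+j] += a[i] * b[j];
        for (i=0; i<N; i++)                   /* fold: the coefficient of x^(N+i) wraps onto x^i */
            c[i] = (tmp[i] + tmp[N+i]) & mod_mask;
    }

The AVX2 routine computes the same thing 16 coefficients at a time, keeping the intermediate sums in 16-bit lanes; overflow there is harmless because only the low log2(q) bits survive the final mask.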
@@ -709,6 +785,169 @@ uint8_t ntru_mult_tern_sse(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint
 }
 #endif   /* __SSSE3__ */
 
+#ifdef __AVX2__
+/* Optimized for small df */
+uint8_t ntru_mult_tern_avx2_sparse(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
+    uint16_t N = a->N;
+    if (N != b->N)
+        return 0;
+    memset(&c->coeffs, 0, N * sizeof c->coeffs[0]);
+    c->N = N;
+
+    /* add coefficients that are multiplied by 1 */
+    uint16_t i;
+    for (i=0; i<b->num_ones; i++) {
+        int16_t j;
+        int16_t k = b->ones[i];
+        uint16_t j_end = N<b->ones[i] ? 0 : N-b->ones[i];
+        /* it is safe not to truncate the last block of 8 coefficients */
+        /* because there is extra room at the end of the coeffs array  */
+        for (j=0; j<j_end; j+=16,k+=16) {
+            __m256i ck = _mm256_lddqu_si256((__m256i*)&c->coeffs[k]);
+            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
+            __m256i ca = _mm256_add_epi16(ck, aj);
+            _mm256_storeu_si256((__m256i*)&c->coeffs[k], ca);
+        }
+        j = j_end;
+        for (k=0; j<N-15; j+=16,k+=16) {
+            __m256i ck = _mm256_lddqu_si256((__m256i*)&c->coeffs[k]);
+            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
+            __m256i ca = _mm256_add_epi16(ck, aj);
+            _mm256_storeu_si256((__m256i*)&c->coeffs[k], ca);
+        }
+        for (; j<N; j++,k++)
+            c->coeffs[k] += a->coeffs[j];
+    }
+    /* subtract coefficients that are multiplied by -1 */
+    for (i=0; i<b->num_neg_ones; i++) {
+        int16_t j;
+        int16_t k = b->neg_ones[i];
+        uint16_t j_end = N<b->neg_ones[i] ? 0 : N-b->neg_ones[i];
+        /* it is safe not to truncate the last block of 8 coefficients */
+        /* because there is extra room at the end of the coeffs array  */
+        for (j=0; j<j_end; j+=16,k+=16) {
+            __m256i ck = _mm256_lddqu_si256((__m256i*)&c->coeffs[k]);
+            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
+            __m256i ca = _mm256_sub_epi16(ck, aj);
+            _mm256_storeu_si256((__m256i*)&c->coeffs[k], ca);
+        }
+        j = j_end;
+        for (k=0; j<N-15; j+=16,k+=16) {
+            __m256i ck = _mm256_lddqu_si256((__m256i*)&c->coeffs[k]);
+            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
+            __m256i ca = _mm256_sub_epi16(ck, aj);
+            _mm256_storeu_si256((__m256i*)&c->coeffs[k], ca);
+        }
+        for (; j<N; j++,k++)
+            c->coeffs[k] -= a->coeffs[j];
+    }
+
+    ntru_mod_mask(c, mod_mask);
+    return 1;
+}
+
+/* Optimized for large df */
+uint8_t ntru_mult_tern_avx2_dense(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
+    uint16_t N = a->N;
+    if (N != b->N)
+        return 0;
+    c->N = N;
+
+    uint16_t i;
+    for (i=N; i<NTRU_INT_POLY_SIZE; i++)
+        a->coeffs[i] = 0;
+    int16_t c_coeffs_arr[16+2*NTRU_INT_POLY_SIZE];   /* double capacity for intermediate result + another 8 */
+    int16_t *c_coeffs = c_coeffs_arr + 16;
+    memset(&c_coeffs_arr, 0, sizeof(c_coeffs_arr));
+
+    __m256i a_coeffs0[16];
+    a_coeffs0[0] = _mm256_lddqu_si256((__m256i*)&a->coeffs[0]);
+
+    for (i=1; i<16; i++) {
+        /* Emulate the SSE full-register shifting behaviour in AVX2 (the */
+        /* corresponding _mm256_slli_si256 instruction shifts the two */
+        /* 128-bit lanes independently instead of the whole register). */
+        /* Two AVX2 instructions are needed for this. */
+        __m256i mask = _mm256_permute2x128_si256(a_coeffs0[i-1], a_coeffs0[i-1], _MM_SHUFFLE(0,0,2,0));
+        a_coeffs0[i] = _mm256_alignr_epi8(a_coeffs0[i-1], mask, 14);
+    }
+
+    /* add coefficients that are multiplied by 1 */
+    for (i=0; i<b->num_ones; i++) {
+        int16_t k = b->ones[i];
+        /* process the first num_coeffs0 coefficients, 1<=num_coeffs0<=8 */
+        uint8_t num_bytes0 = 32 - (((size_t)&c_coeffs[k])%32);
+        uint8_t num_coeffs0 = num_bytes0 / 2;   /* c_coeffs[k+num_coeffs0] is 32-byte aligned */
+        k -= 16 - num_coeffs0;
+        __m256i *ck = (__m256i*)&c_coeffs[k];
+        __m256i aj = a_coeffs0[16-num_coeffs0];
+        __m256i ca = _mm256_add_epi16(*ck, aj);
+        _mm256_store_si256(ck, ca);
+        k += 16;
+        /* process the remaining coefficients in blocks of 16. */
+        /* it is safe not to truncate the last block of 16 coefficients */
+        /* because there is extra room at the end of the coeffs array  */
+        ck = (__m256i*)&c_coeffs[k];
+        int16_t j;
+        for (j=num_coeffs0; j<N; j+=16,k+=16) {
+            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
+            __m256i ca = _mm256_add_epi16(*ck, aj);
+            _mm256_store_si256(ck, ca);
+            ck++;
+        }
+    }
+
+    /* subtract coefficients that are multiplied by -1 */
+    for (i=0; i<b->num_neg_ones; i++) {
+        int16_t k = b->neg_ones[i];
+        /* process the first num_coeffs0 coefficients, 1<=num_coeffs0<=8 */
+        uint8_t num_bytes0 = 32 - (((size_t)&c_coeffs[k])%32);
+        uint8_t num_coeffs0 = num_bytes0 / 2;   /* c_coeffs[k+num_coeffs0] is 32-byte aligned */
+        k -= 16 - num_coeffs0;
+        __m256i *ck = (__m256i*)&c_coeffs[k];
+        __m256i aj = a_coeffs0[16-num_coeffs0];
+        __m256i ca = _mm256_sub_epi16(*ck, aj);
+        _mm256_store_si256(ck, ca);
+        k += 16;
+        /* process the remaining coefficients in blocks of 16. */
+        /* it is safe not to truncate the last block of 16 coefficients */
+        /* because there is extra room at the end of the coeffs array  */
+        ck = (__m256i*)&c_coeffs[k];
+        int16_t j;
+        for (j=num_coeffs0; j<N; j+=16,k+=16) {
+            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
+            __m256i ca = _mm256_sub_epi16(*ck, aj);
+            _mm256_store_si256(ck, ca);
+            ck++;
+        }
+    }
+
+    /* reduce c_coeffs[0..2N-1] to [0..N-1] and apply mod_mask to reduce values mod q */
+    /* handle the first coefficients individually if c_coeffs is not 16-byte aligned */
+    for (i=0; ((size_t)&c_coeffs[i])%32; i++)
+        c->coeffs[i] = (c_coeffs[i] + c_coeffs[N+i]) & mod_mask;
+    /* handle the remaining ones in blocks of 16 */
+    __m256i mod_mask_256 = _mm256_set1_epi16(mod_mask);
+    __m256i *ci = (__m256i*)(&c_coeffs[i]);
+    for (; i<N; i+=16) {
+        __m256i c256_1 = _mm256_lddqu_si256((__m256i*)&c_coeffs[i+N]);
+        __m256i c256_0 = _mm256_add_epi16(*ci, c256_1);
+        c256_0 = _mm256_and_si256(c256_0, mod_mask_256);
+        _mm256_storeu_si256((__m256i*)&c->coeffs[i], c256_0);
+        ci++;
+    }
+
+    return 1;
+}
+
+uint8_t ntru_mult_tern_avx2(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
+    if (b->num_ones<NTRU_SPARSE_THRESH && b->num_neg_ones<NTRU_SPARSE_THRESH)
+        return ntru_mult_tern_avx2_sparse(a, b, c, mod_mask);
+    else
+        return ntru_mult_tern_avx2_dense(a, b, c, mod_mask);
+}
+#endif   /* __AVX2__ */
+
 #ifndef NTRU_AVOID_HAMMING_WT_PATENT
 uint8_t ntru_mult_prod(NtruIntPoly *a, NtruProdPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
     uint16_t N = a->N;
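
A note on the shift emulation in `ntru_mult_tern_avx2_dense` above: AVX2 has no single instruction that byte-shifts a whole 256-bit register, because `_mm256_slli_si256`/`_mm256_bslli_epi128` shift the two 128-bit lanes independently. The permute+alignr pair used when building `a_coeffs0[]` can be isolated into a small helper; a sketch (the helper name is not part of the commit):

    #include <immintrin.h>

    /* Shift a 256-bit register left by one 16-bit coefficient (2 bytes),
     * shifting zeros in at the bottom and carrying the top 2 bytes of the
     * low lane across into the high lane. */
    static inline __m256i shift_left_one_coeff(__m256i x) {
        /* t = [ all zeros | low 128 bits of x ] */
        __m256i t = _mm256_permute2x128_si256(x, x, _MM_SHUFFLE(0, 0, 2, 0));
        /* per lane: result = (x_lane << 16 bits) | (t_lane >> 112 bits) */
        return _mm256_alignr_epi8(x, t, 14);
    }

The dense routine applies this 15 times to precompute progressively shifted copies of the first 16 coefficients of `a`, so that the first store at each offset into `c_coeffs` can use an aligned `_mm256_store_si256`.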
@@ -1132,6 +1371,19 @@ void ntru_mod_sse(NtruIntPoly *p, uint16_t mod_mask) {
 }
 #endif
 
+#ifdef __AVX2__
+void ntru_mod_avx2(NtruIntPoly *p, uint16_t mod_mask) {
+    uint16_t i;
+    __m256i mod_mask_256 = _mm256_set1_epi16(mod_mask);
+
+    for (i=0; i<p->N; i+=16) {
+        __m256i a = _mm256_lddqu_si256((__m256i*)&p->coeffs[i]);
+        a = _mm256_and_si256(a, mod_mask_256);
+        _mm256_storeu_si256((__m256i*)&p->coeffs[i], a);
+    }
+}
+#endif   /* __AVX2__ */
+
 void ntru_mod_64(NtruIntPoly *p, uint16_t mod_mask) {
     typedef uint64_t __attribute__((__may_alias__)) uint64_t_alias;
     uint64_t mod_mask_64 = mod_mask;
@@ -1152,7 +1404,9 @@ void ntru_mod_32(NtruIntPoly *p, uint16_t modulus) {
 }
 
 void ntru_mod_mask(NtruIntPoly *p, uint16_t mod_mask) {
-#ifdef __SSSE3__
+#ifdef __AVX2__
+    ntru_mod_avx2(p, mod_mask);
+#elif __SSSE3__
     ntru_mod_sse(p, mod_mask);
 #elif _LP64
     ntru_mod_64(p, mod_mask);
@@ -1227,8 +1481,58 @@ void ntru_mod3_sse(NtruIntPoly *p) {
 }
 #endif   /* __SSSE3__ */
 
+#ifdef __AVX2__
+__m256i NTRU_MOD3_LUT_AVX = {0x0403050403050403, 0, 0x0403050403050403, 0};
+
+void ntru_mod3_avx2(NtruIntPoly *p) {
+    uint16_t i;
+    for (i=0; i<(p->N+15)/16*16; i+=16) {
+        __m256i a = _mm256_lddqu_si256((__m256i*)&p->coeffs[i]);
+
+        /* make positive */
+        __m256i _3000 = _mm256_set1_epi16(3000);
+        a = _mm256_add_epi16(a, _3000);
+
+        /* a = (a>>8) + (a&0xFF);  (sum base 2**8 digits) */
+        __m256i a1 = _mm256_srli_epi16(a, 8);
+        __m256i mask = _mm256_set1_epi16(0x00FF);
+        __m256i a2 = _mm256_and_si256(a, mask);
+        a = _mm256_add_epi16(a1, a2);
+
+        /* a = (a>>4) + (a&0xF);  (sum base 2**4 digits; worst case 0x3B) */
+        a1 = _mm256_srli_epi16(a, 4);
+        mask = _mm256_set1_epi16(0x000F);
+        a2 = _mm256_and_si256(a, mask);
+        a = _mm256_add_epi16(a1, a2);
+
+        /* a = (a>>2) + (a&0x3);  (sum base 2**2 digits; worst case 0x1B) */
+        a1 = _mm256_srli_epi16(a, 2);
+        mask = _mm256_set1_epi16(0x0003);
+        a2 = _mm256_and_si256(a, mask);
+        a = _mm256_add_epi16(a1, a2);
+
+        /* a = (a>>2) + (a&0x3);  (sum base 2**2 digits; worst case 0x7) */
+        a1 = _mm256_srli_epi16(a, 2);
+        mask = _mm256_set1_epi16(0x0003);
+        a2 = _mm256_and_si256(a, mask);
+        a = _mm256_add_epi16(a1, a2);
+
+        __m256i a_mod3 = _mm256_shuffle_epi8(NTRU_MOD3_LUT_AVX, a);
+        /* _mm256_shuffle_epi8 changed bytes 1, 3, 5, ... to non-zero; change them back to zero */
+        mask = _mm256_set1_epi16(0x00FF);
+        a_mod3 = _mm256_and_si256(a_mod3, mask);
+        /* subtract 3 so coefficients are in the 0..2 range */
+        __m256i three = _mm256_set1_epi16(0x0003);
+        a_mod3 = _mm256_sub_epi16(a_mod3, three);
+
+        _mm256_storeu_si256((__m256i*)&p->coeffs[i], a_mod3);
+    }
+}
+#endif   /* __AVX2__ */
+
 void ntru_mod3(NtruIntPoly *p) {
-#ifdef __SSSE3__
+#ifdef __AVX2__
+    ntru_mod3_avx2(p);
+#elif __SSSE3__
     ntru_mod3_sse(p);
 #else
     ntru_mod3_standard(p);
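
The digit-summing in `ntru_mod3_avx2` works because 256, 16 and 4 are all congruent to 1 mod 3, so replacing a value by the sum of its base-256 (then base-16, then base-4) digits preserves the residue mod 3 while shrinking the value until it fits an 8-entry lookup table. The same reduction for a single coefficient, as a scalar sketch (not part of the commit; the AVX2 code uses the shuffle LUT instead of the final `% 3`):

    #include <stdint.h>

    /* Reduce one coefficient to the range 0..2, mirroring ntru_mod3_avx2. */
    static uint16_t mod3_digit_sum(int16_t coeff) {
        uint16_t a = (uint16_t)(coeff + 3000);   /* make positive; 3000 is a multiple of 3 */
        a = (a >> 8) + (a & 0xFF);   /* sum base 2^8 digits: 256 == 1 (mod 3) */
        a = (a >> 4) + (a & 0xF);    /* sum base 2^4 digits */
        a = (a >> 2) + (a & 0x3);    /* sum base 2^2 digits */
        a = (a >> 2) + (a & 0x3);    /* once more; a is now small enough for a byte LUT */
        return a % 3;
    }

In the vector code, `NTRU_MOD3_LUT_AVX` maps each small result v to (v mod 3) + 3 per byte, and the final `_mm256_sub_epi16` brings the coefficients back into the 0..2 range.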

src/poly.h (+34 -4)

@@ -74,7 +74,7 @@ void ntru_sub(NtruIntPoly *a, NtruIntPoly *b);
 uint8_t ntru_mult_tern(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask);
 
 /**
- * @brief General polynomial by ternary polynomial multiplication
+ * @brief General polynomial by ternary polynomial multiplication, 32 bit version
  *
  * Multiplies a NtruIntPoly by a NtruTernPoly. The number of coefficients
  * must be the same for both polynomials.
@@ -89,7 +89,7 @@ uint8_t ntru_mult_tern(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t
 uint8_t ntru_mult_tern_32(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask);
 
 /**
- * @brief General polynomial by ternary polynomial multiplication
+ * @brief General polynomial by ternary polynomial multiplication, 64 bit version
  *
  * Multiplies a NtruIntPoly by a NtruTernPoly. The number of coefficients
  * must be the same for both polynomials.
@@ -118,6 +118,21 @@ uint8_t ntru_mult_tern_64(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint1
  */
 uint8_t ntru_mult_tern_sse(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask);
 
+/**
+ * @brief General polynomial by ternary polynomial multiplication, AVX2 version
+ *
+ * Multiplies a NtruIntPoly by a NtruTernPoly. The number of coefficients
+ * must be the same for both polynomials.
+ * This variant requires AVX2 support.
+ *
+ * @param a a general polynomial
+ * @param b a ternary polynomial
+ * @param c output parameter; a pointer to store the new polynomial
+ * @param mod_mask an AND mask to apply; must be a power of two minus one
+ * @return 0 if the number of coefficients differ, 1 otherwise
+ */
+uint8_t ntru_mult_tern_avx2(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask);
+
 #ifndef NTRU_AVOID_HAMMING_WT_PATENT
 /**
  * @brief General polynomial by product-form polynomial multiplication
@@ -255,7 +270,7 @@ uint8_t ntru_mult_int(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t m
 uint8_t ntru_mult_int_16(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask);
 
 /**
- * @brief Multiplication of two general polynomials with a modulus
+ * @brief Multiplication of two general polynomials with a modulus, 64 bit version
  *
  * Multiplies a NtruIntPoly by another, taking the coefficient values modulo an integer.
  * The number of coefficients must be the same for both polynomials.
@@ -270,7 +285,7 @@ uint8_t ntru_mult_int_16(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_
 uint8_t ntru_mult_int_64(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask);
 
 /**
- * @brief Multiplication of two general polynomials with a modulus
+ * @brief Multiplication of two general polynomials with a modulus, SSSE3 version
  *
  * Multiplies a NtruIntPoly by another, taking the coefficient values modulo an integer.
  * The number of coefficients must be the same for both polynomials.
@@ -285,6 +300,21 @@ uint8_t ntru_mult_int_64(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_
 uint8_t ntru_mult_int_sse(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask);
 
 /**
+ * @brief Multiplication of two general polynomials with a modulus, AVX2 version
+ *
+ * Multiplies a NtruIntPoly by another, taking the coefficient values modulo an integer.
+ * The number of coefficients must be the same for both polynomials.
+ * Requires AVX2 support.
+ *
+ * @param a input and output parameter; coefficients are overwritten
+ * @param b a polynomial to multiply by
+ * @param c output parameter; a pointer to store the new polynomial
+ * @param mod_mask an AND mask to apply to the coefficients of c
+ * @return 0 if the number of coefficients differ, 1 otherwise
+ */
+uint8_t ntru_mult_int_avx2(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask);
+
+/**
  * @brief Reduction modulo a power of two
  *
  * Reduces the coefficients of an NtruIntPoly modulo a power of two.
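
Callers keep using the existing dispatchers, so no API changes are needed. A minimal usage sketch, assuming the struct fields `N` and `coeffs` shown in the diff and an include path that reaches src/poly.h; the parameters N=11 and q=32 are illustrative only, not a real NTRU parameter set:

    #include <stdio.h>
    #include <string.h>
    #include "poly.h"

    int main(void) {
        uint16_t q = 32;                      /* modulus; must be a power of two */
        NtruIntPoly a, b, c;
        memset(&a, 0, sizeof a);
        memset(&b, 0, sizeof b);
        a.N = 11;  b.N = 11;
        a.coeffs[0] = 1;  a.coeffs[3] = 5;    /* a = 1 + 5x^3 */
        b.coeffs[1] = 2;  b.coeffs[10] = 7;   /* b = 2x + 7x^10 */

        /* dispatches to ntru_mult_int_avx2 when compiled with AVX2 enabled */
        if (!ntru_mult_int(&a, &b, &c, q - 1))
            return 1;
        printf("c[1] = %d\n", c.coeffs[1]);   /* coefficient of x^1 */
        return 0;
    }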