
Runtime detection of SSSE3 and AVX2

Tim Buktu, 2 years ago
commit 6fab0408c5
19 changed files with 1671 additions and 1425 deletions
  1. Makefile.bsd (+40 -32)
  2. Makefile.linux (+43 -32)
  3. Makefile.os2 (+37 -10)
  4. Makefile.osx (+36 -30)
  5. Makefile.win (+38 -10)
  6. README.md (+3 -6)
  7. src/hash.c (+68 -516)
  8. src/hash.h (+8 -0)
  9. src/hash_simd.c (+513 -0)
  10. src/hash_simd.h (+14 -0)
  11. src/ntru.c (+18 -0)
  12. src/poly.c (+81 -711)
  13. src/poly.h (+13 -78)
  14. src/poly_avx2.c (+293 -0)
  15. src/poly_avx2.h (+45 -0)
  16. src/poly_ssse3.c (+357 -0)
  17. src/poly_ssse3.h (+60 -0)
  18. tests/test.c (+3 -0)
  19. tests/test_poly.c (+1 -0)
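
In short, the commit replaces compile-time #ifdef __SSSE3__/__AVX2__ selection with function pointers that are assigned once at runtime via GCC's __builtin_cpu_supports() whenever the new NTRU_DETECT_SIMD flag is defined. The following is a minimal, self-contained sketch of that dispatch pattern, not the library's actual code: the sha256_4way* names and stub bodies are placeholders for the real routines in src/hash.c and src/hash_simd.c, and it assumes GCC or Clang on an x86 target.

```c
/* Minimal sketch of the runtime-dispatch pattern (illustrative stand-ins only).
 * Hypothetical build: gcc -DNTRU_DETECT_SIMD -O2 dispatch_sketch.c -o dispatch_sketch */
#include <stdint.h>
#include <stdio.h>

static void sha256_4way_nosimd(uint8_t *in[4], uint16_t len, uint8_t *out[4]) {
    (void)in; (void)len; (void)out;          /* stand-in for the scalar fallback */
    puts("scalar fallback");
}

static void sha256_4way_simd(uint8_t *in[4], uint16_t len, uint8_t *out[4]) {
    (void)in; (void)len; (void)out;          /* stand-in for the SSSE3/AVX2 multi-block path */
    puts("SSSE3/AVX2 path");
}

/* Every caller goes through this pointer, like ntru_sha256_4way_ptr in src/hash.c. */
static void (*sha256_4way)(uint8_t *[4], uint16_t, uint8_t *[4]);

static void set_optimized_impl(void) {
#ifdef NTRU_DETECT_SIMD
    __builtin_cpu_init();                    /* prime CPU detection before __builtin_cpu_supports() */
    if (__builtin_cpu_supports("ssse3") || __builtin_cpu_supports("avx2"))
        sha256_4way = sha256_4way_simd;
    else
        sha256_4way = sha256_4way_nosimd;
#else
    sha256_4way = sha256_4way_nosimd;        /* selection fixed at compile time */
#endif
}

int main(void) {
    set_optimized_impl();
    sha256_4way(NULL, 0, NULL);              /* real callers pass 4 message/digest buffers */
    return 0;
}
```

In the library itself, ntru_set_optimized_impl() in src/ntru.c fills this role and is called at the top of the public key-generation, encryption, and decryption functions, as the src/ntru.c hunks further down show.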

Makefile.bsd (+40 -32)

@@ -2,43 +2,41 @@ CC?=cc
2 2
 AS=$(CC) -c
3 3
 AR?=ar
4 4
 
5
-CFLAGS?=-g
6
-CFLAGS+=-Wall -Wextra -Wno-unused-parameter
7
-SSSE3_FLAG = $(shell /usr/bin/grep -o SSSE3 /var/run/dmesg.boot | /usr/bin/head -1)
8
-ifneq ($(SSE), no)
9
-    ifeq ($(SSSE3_FLAG), SSSE3)
10
-        SSE=yes
11
-    endif
5
+# are we running on an X86 machine?
6
+X86_ARCH=no
7
+MACHINE=$(shell uname -m | sed 's/i.86/i386/g')
8
+ifeq ($(MACHINE), i386)
9
+    X86_ARCH=yes
12 10
 endif
13
-AVX2_FLAG = $(shell /usr/bin/grep -o AVX2 /var/run/dmesg.boot | /usr/bin/head -1)
14
-ifneq ($(AVX2), no)
15
-    ifeq ($(AVX2_FLAG), AVX2)
16
-        AVX2=yes
17
-    endif
11
+ifeq ($(MACHINE), amd64)
12
+    X86_ARCH=yes
18 13
 endif
19
-ifeq ($(AVX2), yes)
20
-    SSE=yes
14
+
15
+# SIMD is only supported on X86
16
+ifeq ($(X86_ARCH), no)
17
+    SIMD=none
21 18
 endif
22
-ifeq ($(SSE), no)
23
-    AVX2=no
19
+
20
+ifeq ($(SIMD), sse)
21
+    SIMD=ssse3
24 22
 endif
25
-ifeq ($(SSE), yes)
23
+
24
+# set CFLAGS depending on SIMD
25
+CFLAGS?=-g
26
+CFLAGS+=-Wall -Wextra -Wno-unused-parameter
27
+ifeq ($(SIMD), ssse3)
26 28
     CFLAGS+=-mssse3
27
-endif
28
-ifeq ($(AVX2), yes)
29
-    CFLAGS+=-mavx2
29
+else ifeq ($(SIMD), avx2)
30
+    CFLAGS+=-mssse3 -mavx2
31
+else ifneq ($(SIMD), none)
32
+    SIMD=auto
33
+    CFLAGS+=-DNTRU_DETECT_SIMD
30 34
 endif
31 35
 
32
-# use -march=native if we're compiling for x86
36
+# use -march=native if we're compiling 'bench' for x86 and SIMD=auto
33 37
 BENCH_ARCH_OPTION=
34
-MACHINE=$(shell uname -m | sed 's/i.86/i386/g')
35
-ifeq ($(SSE), yes)
36
-    ifeq ($(MACHINE), i386)
37
-        BENCH_ARCH_OPTION=-march=native
38
-    endif
39
-    ifeq ($(MACHINE), amd64)
40
-        BENCH_ARCH_OPTION=-march=native
41
-    endif
38
+ifeq ($(SIMD), auto)
39
+    BENCH_ARCH_OPTION=-march=native
42 40
 endif
43 41
 OPTFLAGS=-O2
44 42
 bench: OPTFLAGS=-O3 $(BENCH_ARCH_OPTION)
@@ -50,9 +48,10 @@ endif
50 48
 SRCDIR=src
51 49
 TESTDIR=tests
52 50
 LIB_OBJS=bitstring.o encparams.o hash.o idxgen.o key.o mgf.o ntru.o poly.o rand.o arith.o sha1.o sha2.o nist_ctr_drbg.o rijndael.o
53
-ifeq ($(SSE), yes)
54
-    ifeq ($(MACHINE), amd64)
55
-        LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o
51
+ifneq ($(SIMD), none)
52
+    LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o hash_simd.o poly_ssse3.o
53
+    ifneq ($(SIMD), ssse3)
54
+        LIB_OBJS+=poly_avx2.o
56 55
     endif
57 56
 endif
58 57
 TEST_OBJS=test_bitstring.o test_hash.o test_idxgen.o test_key.o test_ntru.o test.o test_poly.o test_util.o
@@ -163,6 +162,15 @@ hybrid: static-lib
163 162
 $(SRCDIR)/%.o: $(SRCDIR)/%.c
164 163
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c -fPIC $< -o $@
165 164
 
165
+$(SRCDIR)/poly_ssse3.o: $(SRCDIR)/poly_ssse3.c
166
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/poly_ssse3.c -o $(SRCDIR)/poly_ssse3.o
167
+
168
+$(SRCDIR)/poly_avx2.o: $(SRCDIR)/poly_avx2.c
169
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mavx2 -c -fPIC $(SRCDIR)/poly_avx2.c -o $(SRCDIR)/poly_avx2.o
170
+
171
+$(SRCDIR)/hash_simd.o: $(SRCDIR)/hash_simd.c
172
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/hash_simd.c -o $(SRCDIR)/hash_simd.o
173
+
166 174
 $(SRCDIR)/sha1-mb-x86_64.s: $(SRCDIR)/sha1-mb-x86_64.pl; CC=$(CC) ASM="$(AS)" $(PERL) $(SRCDIR)/sha1-mb-x86_64.pl $(PERLASM_SCHEME) > $@
167 175
 $(SRCDIR)/sha1-mb-x86_64.o: $(SRCDIR)/sha1-mb-x86_64.s
168 176
 	$(AS) $(SRCDIR)/sha1-mb-x86_64.s -o $@

Makefile.linux (+43 -32)

@@ -2,43 +2,44 @@ CC?=gcc
2 2
 AS=$(CC) -c
3 3
 AR?=ar
4 4
 
5
-CFLAGS?=-g
6
-CFLAGS+=-Wall -Wextra -Wno-unused-parameter
7
-SSSE3_FLAG = $(shell /bin/grep -m 1 -o ssse3 /proc/cpuinfo)
8
-ifneq ($(SSE), no)
9
-    ifeq ($(SSSE3_FLAG), ssse3)
10
-        SSE=yes
11
-    endif
5
+# are we running on an X86 machine?
6
+X86_ARCH=no
7
+MACHINE=$(shell uname -m | sed 's/i.86/i386/g')
8
+ifeq ($(MACHINE), i386)
9
+    X86_ARCH=yes
12 10
 endif
13
-AVX2_FLAG = $(shell /bin/grep -m 1 -o avx2 /proc/cpuinfo)
14
-ifneq ($(AVX2), no)
15
-    ifeq ($(AVX2_FLAG), avx2)
16
-        AVX2=yes
17
-    endif
11
+ifeq ($(MACHINE), x86_64)
12
+    X86_ARCH=yes
18 13
 endif
19
-ifeq ($(AVX2), yes)
20
-    SSE=yes
14
+
15
+# SIMD is only supported on X86
16
+ifeq ($(X86_ARCH), no)
17
+    SIMD=none
21 18
 endif
22
-ifeq ($(SSE), no)
23
-    AVX2=no
19
+
20
+ifeq ($(SIMD), sse)
21
+    SIMD=ssse3
24 22
 endif
25
-ifeq ($(SSE), yes)
23
+
24
+# set CFLAGS depending on SIMD
25
+CFLAGS?=-g
26
+CFLAGS+=-Wall -Wextra -Wno-unused-parameter
27
+ifeq ($(SIMD), ssse3)
26 28
     CFLAGS+=-mssse3
27
-endif
28
-ifeq ($(AVX2), yes)
29
-    CFLAGS+=-mavx2
29
+else ifeq ($(SIMD), avx2)
30
+    CFLAGS+=-mssse3 -mavx2
31
+else ifneq ($(SIMD), none)
32
+    SIMD=auto
33
+    CFLAGS+=-DNTRU_DETECT_SIMD
30 34
 endif
31 35
 
32
-# use -march=native if we're compiling for x86
36
+# use -march=native if we're compiling 'bench' for x86 and SIMD={auto,avx2}
33 37
 BENCH_ARCH_OPTION=
34
-MACHINE=$(shell uname -m | sed 's/i.86/i386/g')
35
-ifeq ($(SSE), yes)
36
-    ifeq ($(MACHINE), i386)
37
-        BENCH_ARCH_OPTION=-march=native
38
-    endif
39
-    ifeq ($(MACHINE), x86_64)
40
-        BENCH_ARCH_OPTION=-march=native
41
-    endif
38
+ifeq ($(SIMD), auto)
39
+    BENCH_ARCH_OPTION=-march=native
40
+endif
41
+ifeq ($(SIMD), avx2)
42
+    BENCH_ARCH_OPTION=-march=native
42 43
 endif
43 44
 OPTFLAGS=-O2
44 45
 bench: OPTFLAGS=-O3 $(BENCH_ARCH_OPTION)
@@ -48,9 +49,10 @@ LIBS+=-lrt
48 49
 SRCDIR=src
49 50
 TESTDIR=tests
50 51
 LIB_OBJS=bitstring.o encparams.o hash.o idxgen.o key.o mgf.o ntru.o poly.o rand.o arith.o sha1.o sha2.o nist_ctr_drbg.o rijndael.o
51
-ifeq ($(SSE), yes)
52
-    ifeq ($(MACHINE), x86_64)
53
-        LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o
52
+ifneq ($(SIMD), none)
53
+    LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o hash_simd.o poly_ssse3.o
54
+    ifneq ($(SIMD), ssse3)
55
+        LIB_OBJS+=poly_avx2.o
54 56
     endif
55 57
 endif
56 58
 TEST_OBJS=test_bitstring.o test_hash.o test_idxgen.o test_key.o test_ntru.o test.o test_poly.o test_util.o
@@ -161,6 +163,15 @@ hybrid: static-lib
161 163
 $(SRCDIR)/%.o: $(SRCDIR)/%.c
162 164
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c -fPIC $< -o $@
163 165
 
166
+$(SRCDIR)/poly_ssse3.o: $(SRCDIR)/poly_ssse3.c
167
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/poly_ssse3.c -o $(SRCDIR)/poly_ssse3.o
168
+
169
+$(SRCDIR)/poly_avx2.o: $(SRCDIR)/poly_avx2.c
170
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mavx2 -c -fPIC $(SRCDIR)/poly_avx2.c -o $(SRCDIR)/poly_avx2.o
171
+
172
+$(SRCDIR)/hash_simd.o: $(SRCDIR)/hash_simd.c
173
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/hash_simd.c -o $(SRCDIR)/hash_simd.o
174
+
164 175
 $(SRCDIR)/sha1-mb-x86_64.s: $(SRCDIR)/sha1-mb-x86_64.pl; CC=$(CC) ASM="$(AS)" $(PERL) $(SRCDIR)/sha1-mb-x86_64.pl $(PERLASM_SCHEME) > $@
165 176
 $(SRCDIR)/sha1-mb-x86_64.o: $(SRCDIR)/sha1-mb-x86_64.s
166 177
 	$(AS) $(SRCDIR)/sha1-mb-x86_64.s -o $@

Makefile.os2 (+37 -10)

@@ -1,22 +1,40 @@
1 1
 CC?=gcc
2 2
 AS=$(CC) -c
3
-OPTFLAGS=-O2
4
-bench: OPTFLAGS=-O3 -march=native
5
-CFLAGS?=-g $(OPTFLAGS)
6
-CFLAGS+=-Wall -Wextra -Wno-unused-parameter
7
-ifeq ($(AVX2), yes)
8
-    CFLAGS+=-mavx2
9
-    SSE=yes
3
+
4
+ifeq ($(SIMD), sse)
5
+    SIMD=ssse3
10 6
 endif
11
-ifeq ($(SSE), yes)
7
+
8
+# set CFLAGS depending on SIMD
9
+CFLAGS?=-g
10
+CFLAGS+=-Wall -Wextra -Wno-unused-parameter
11
+ifeq ($(SIMD), ssse3)
12 12
     CFLAGS+=-mssse3
13
+else ifeq ($(SIMD), avx2)
14
+    CFLAGS+=-mssse3 -mavx2
15
+else ifneq ($(SIMD), none)
16
+    SIMD=auto
17
+    CFLAGS+=-DNTRU_DETECT_SIMD
18
+endif
19
+
20
+# use -march=native if SIMD=auto
21
+BENCH_ARCH_OPTION=
22
+ifeq ($(SIMD), auto)
23
+    BENCH_ARCH_OPTION=-march=native
13 24
 endif
25
+OPTFLAGS=-O2
26
+bench: OPTFLAGS=-O3 $(BENCH_ARCH_OPTION)
27
+CFLAGS+=$(OPTFLAGS)
28
+
14 29
 LIBS+=-lrt
15 30
 SRCDIR=src
16 31
 TESTDIR=tests
17 32
 LIB_OBJS=bitstring.o encparams.o hash.o idxgen.o key.o mgf.o ntru.o poly.o rand.o arith.o sha1.o sha2.o nist_ctr_drbg.o rijndael.o
18
-ifeq ($(SSE), yes)
19
-    LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o
33
+ifneq ($(SIMD), none)
34
+    LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o hash_simd.o poly_ssse3.o
35
+    ifneq ($(SIMD), ssse3)
36
+        LIB_OBJS+=poly_avx2.o
37
+    endif
20 38
 endif
21 39
 TEST_OBJS=test_bitstring.o test_hash.o test_idxgen.o test_key.o test_ntru.o test.o test_poly.o test_util.o
22 40
 VERSION=0.5
@@ -110,6 +128,15 @@ hybrid: lib
110 128
 $(SRCDIR)/%.o: $(SRCDIR)/%.c
111 129
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@
112 130
 
131
+$(SRCDIR)/poly_ssse3.o: $(SRCDIR)/poly_ssse3.c
132
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/poly_ssse3.c -o $(SRCDIR)/poly_ssse3.o
133
+
134
+$(SRCDIR)/poly_avx2.o: $(SRCDIR)/poly_avx2.c
135
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mavx2 -c -fPIC $(SRCDIR)/poly_avx2.c -o $(SRCDIR)/poly_avx2.o
136
+
137
+$(SRCDIR)/hash_simd.o: $(SRCDIR)/hash_simd.c
138
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/hash_simd.c -o $(SRCDIR)/hash_simd.o
139
+
113 140
 $(SRCDIR)/sha1-mb-x86_64.s: $(SRCDIR)/sha1-mb-x86_64.pl; CC=$(CC) ASM="$(AS)" $(PERL) $(SRCDIR)/sha1-mb-x86_64.pl $(PERLASM_SCHEME) > $@
114 141
 $(SRCDIR)/sha1-mb-x86_64.o: $(SRCDIR)/sha1-mb-x86_64.s
115 142
 	$(AS) $(SRCDIR)/sha1-mb-x86_64.s -o $@

Makefile.osx (+36 -30)

@@ -1,40 +1,37 @@
1 1
 CC?=gcc
2 2
 AS=$(CC) -c
3
-OPTFLAGS=-O2
4
-bench: OPTFLAGS=-O3
5
-CFLAGS=-g -Wall -Wextra -Wno-unused-parameter $(OPTFLAGS)
6
-SSSE3_FLAG = $(shell /usr/sbin/sysctl machdep.cpu.features | grep -m 1 -ow SSSE3)
7
-ifneq ($(SSE), no)
8
-    ifeq ($(SSSE3_FLAG), SSSE3)
9
-        SSE=yes
10
-    endif
11
-endif
12
-AVX2_FLAG = $(shell /usr/sbin/sysctl machdep.cpu.features | grep -m 1 -ow AVX2)
13
-ifneq ($(AVX2), no)
14
-    ifeq ($(AVX2_FLAG), AVX2)
15
-        AVX2=yes
16
-    endif
17
-endif
18
-ifeq ($(AVX2), yes)
19
-    SSE=yes
20
-endif
21
-ifeq ($(SSE), no)
22
-    AVX2=no
23
-endif
24
-ifeq ($(SSE), yes)
25
-    CFLAGS+=-mssse3
26
-endif
27
-ifeq ($(SSE), no)
28
-    CFLAGS+=-march=x86-64
3
+
4
+# set CFLAGS depending on SIMD
5
+CFLAGS?=-g
6
+CFLAGS+=-Wall -Wextra -Wno-unused-parameter
7
+ifeq ($(SIMD), ssse3)
8
+    CFLAGS+=-mssse3 -mno-avx2
9
+else ifeq ($(SIMD), avx2)
10
+    CFLAGS+=-mssse3 -mavx2
11
+else ifneq ($(SIMD), none)
12
+    SIMD=auto
13
+    CFLAGS+=-DNTRU_DETECT_SIMD
14
+else
15
+    CFLAGS+=-mno-ssse3 -mno-avx2
29 16
 endif
30
-ifeq ($(AVX2), yes)
31
-    CFLAGS+=-mavx2
17
+
18
+# use -march=native if SIMD=auto
19
+BENCH_ARCH_OPTION=
20
+ifeq ($(SIMD), auto)
21
+    BENCH_ARCH_OPTION=-march=native
32 22
 endif
23
+OPTFLAGS=-O2
24
+bench: OPTFLAGS=-O3 $(BENCH_ARCH_OPTION)
25
+CFLAGS+=$(OPTFLAGS)
26
+
33 27
 SRCDIR=src
34 28
 TESTDIR=tests
35 29
 LIB_OBJS=bitstring.o encparams.o hash.o idxgen.o key.o mgf.o ntru.o poly.o rand.o arith.o sha1.o sha2.o nist_ctr_drbg.o rijndael.o
36
-ifeq ($(SSE), yes)
37
-    LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o
30
+ifneq ($(SIMD), none)
31
+    LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o hash_simd.o poly_ssse3.o
32
+    ifneq ($(SIMD), ssse3)
33
+        LIB_OBJS+=poly_avx2.o
34
+    endif
38 35
 endif
39 36
 TEST_OBJS=test_bitstring.o test_hash.o test_idxgen.o test_key.o test_ntru.o test.o test_poly.o test_util.o
40 37
 VERSION=0.5
@@ -119,6 +116,15 @@ hybrid: lib
119 116
 $(SRCDIR)/%.o: $(SRCDIR)/%.c
120 117
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c -fPIC $< -o $@
121 118
 
119
+$(SRCDIR)/poly_ssse3.o: $(SRCDIR)/poly_ssse3.c
120
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/poly_ssse3.c -o $(SRCDIR)/poly_ssse3.o
121
+
122
+$(SRCDIR)/poly_avx2.o: $(SRCDIR)/poly_avx2.c
123
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mavx2 -c -fPIC $(SRCDIR)/poly_avx2.c -o $(SRCDIR)/poly_avx2.o
124
+
125
+$(SRCDIR)/hash_simd.o: $(SRCDIR)/hash_simd.c
126
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/hash_simd.c -o $(SRCDIR)/hash_simd.o
127
+
122 128
 $(SRCDIR)/sha1-mb-x86_64.s: $(SRCDIR)/sha1-mb-x86_64.pl; CC=$(CC) ASM="$(AS)" $(PERL) $(SRCDIR)/sha1-mb-x86_64.pl $(PERLASM_SCHEME) > $@
123 129
 $(SRCDIR)/sha1-mb-x86_64.o: $(SRCDIR)/sha1-mb-x86_64.s
124 130
 	$(AS) $(SRCDIR)/sha1-mb-x86_64.s -o $@

Makefile.win (+38 -10)

@@ -1,22 +1,41 @@
1 1
 CC?=gcc
2 2
 AS=$(CC) -c
3
-OPTFLAGS=-O2
4
-bench: OPTFLAGS=-O3 -march=native
5
-CFLAGS?=-g $(OPTFLAGS)
6
-CFLAGS+=-Wall -Wextra -Wno-unused-parameter
7
-ifeq ($(AVX2), yes)
8
-    CFLAGS+=-mavx2
9
-    SSE=yes
3
+
4
+ifeq ($(SIMD), sse)
5
+    SIMD=ssse3
10 6
 endif
11
-ifeq ($(SSE), yes)
7
+
8
+# set CFLAGS depending on SIMD
9
+CFLAGS?=-g
10
+CFLAGS+=-Wall -Wextra -Wno-unused-parameter
11
+ifeq ($(SIMD), ssse3)
12 12
     CFLAGS+=-mssse3
13
+else ifeq ($(SIMD), avx2)
14
+    CFLAGS+=-mssse3 -mavx2
15
+else ifneq ($(SIMD), none)
16
+    SIMD=auto
17
+    CFLAGS+=-DNTRU_DETECT_SIMD
18
+endif
19
+
20
+# use -march=native if SIMD=auto
21
+BENCH_ARCH_OPTION=
22
+ifeq ($(SIMD), auto)
23
+    BENCH_ARCH_OPTION=-march=native
13 24
 endif
25
+OPTFLAGS=-O2
26
+bench: OPTFLAGS=-O3 $(BENCH_ARCH_OPTION)
27
+CFLAGS+=$(OPTFLAGS)
28
+
14 29
 SRCDIR=src
15 30
 TESTDIR=tests
16 31
 LIB_OBJS=bitstring.o encparams.o hash.o idxgen.o key.o mgf.o ntru.o poly.o rand.o arith.o sha1.o sha2.o nist_ctr_drbg.o rijndael.o
17
-ifeq ($(SSE), yes)
18
-    LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o
32
+ifneq ($(SIMD), none)
33
+    LIB_OBJS+=sha1-mb-x86_64.o sha256-mb-x86_64.o hash_simd.o poly_ssse3.o
34
+    ifneq ($(SIMD), ssse3)
35
+        LIB_OBJS+=poly_avx2.o
36
+    endif
19 37
 endif
38
+
20 39
 TEST_OBJS=test_bitstring.o test_hash.o test_idxgen.o test_key.o test_ntru.o test.o test_poly.o test_util.o
21 40
 VERSION=0.5
22 41
 INST_PFX=%PROGRAMFILES%
@@ -111,6 +130,15 @@ hybrid: lib
111 130
 $(SRCDIR)/%.o: $(SRCDIR)/%.c
112 131
 	$(CC) $(CFLAGS) $(CPPFLAGS) -c $< -o $@
113 132
 
133
+$(SRCDIR)/poly_ssse3.o: $(SRCDIR)/poly_ssse3.c
134
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/poly_ssse3.c -o $(SRCDIR)/poly_ssse3.o
135
+
136
+$(SRCDIR)/poly_avx2.o: $(SRCDIR)/poly_avx2.c
137
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mavx2 -c -fPIC $(SRCDIR)/poly_avx2.c -o $(SRCDIR)/poly_avx2.o
138
+
139
+$(SRCDIR)/hash_simd.o: $(SRCDIR)/hash_simd.c
140
+	$(CC) $(CFLAGS) $(CPPFLAGS) -mssse3 -c -fPIC $(SRCDIR)/hash_simd.c -o $(SRCDIR)/hash_simd.o
141
+
114 142
 $(SRCDIR)/sha1-mb-x86_64.s: $(SRCDIR)/sha1-mb-x86_64.pl; CC=$(CC) ASM="$(AS)" $(PERL) $(SRCDIR)/sha1-mb-x86_64.pl $(PERLASM_SCHEME) > $@
115 143
 $(SRCDIR)/sha1-mb-x86_64.o: $(SRCDIR)/sha1-mb-x86_64.s
116 144
 	$(AS) $(SRCDIR)/sha1-mb-x86_64.s -o $@

README.md (+3 -6)

@@ -20,12 +20,9 @@ page at https://tbuktu.github.com/ntru/.
20 20
 Run ```make``` to build the library, or ```make test``` to run unit tests. ```make bench``` builds a benchmark program.
21 21
 On *BSD, use ```gmake``` instead of ```make```.
22 22
 
23
-The ```SSE``` environment variable enables SSSE3 support (```SSE=yes```)
24
-or disables it (```SSE=no```).
25
-Default on Linux, BSD, and MacOS is to autodetect SSSE3 on the build host,
26
-Windows default is no SSSE3.
27
-
28
-The ```AVX2``` environment variable controls AVX2 support and works just like the ```SSE``` variable.
23
+The ```SIMD``` environment variable controls SSSE3 and AVX2 support.
24
+The default is ```auto```, which means SSSE3 and AVX2 are detected at runtime.
25
+Other values are ```none```, ```ssse3```, and ```avx2```.
29 26
 
30 27
 If the ```NTRU_AVOID_HAMMING_WT_PATENT``` preprocessor flag is supplied, the library won't support
31 28
 parameter sets that will be patent encumbered after Aug 19, 2017. See the *Parameter Sets* section
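
Based on the Makefile logic in this commit, illustrative invocations would be ```SIMD=avx2 make``` (or equivalently ```make SIMD=avx2```) to force the AVX2 build, ```SIMD=none make``` to build only the portable code, and a plain ```make``` to keep the default ```auto``` behaviour with runtime detection.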

src/hash.c (+68 -516)

@@ -1,564 +1,116 @@
1 1
 #include <string.h>
2 2
 #include <stdint.h>
3
-#if defined __SSSE3__ && _LP64
4
-#include <tmmintrin.h>
5
-#endif
6
-#ifdef WIN32
7
-#include <Winsock2.h>
8
-#else
9
-#include <netinet/in.h>
10
-#endif
11 3
 #include "sph_sha1.h"
12 4
 #include "sph_sha2.h"
13 5
 #include "hash.h"
6
+#include "hash_simd.h"
14 7
 
15
-void ntru_sha1(uint8_t *input, uint16_t input_len, uint8_t *digest) {
16
-    sph_sha1_context context;
17
-    sph_sha1_init(&context);
18
-    sph_sha1(&context, input, input_len);
19
-    sph_sha1_close(&context, digest);
20
-}
21
-
22
-void ntru_sha256(uint8_t *input, uint16_t input_len, uint8_t *digest) {
23
-    sph_sha256_context context;
24
-    sph_sha256_init(&context);
25
-    sph_sha256(&context, input, input_len);
26
-    sph_sha256_close(&context, digest);
27
-}
28
-
29
-#if defined __SSSE3__ && _LP64
30
-typedef struct {
31
-    uint32_t A[8], B[8], C[8], D[8], E[8];
32
-    uint32_t Nl,Nh;
33
-    uint32_t data[8][16];
34
-    uint8_t num;   /* 1 or 2 */
35
-} SHA1_MB_CTX;
36
-typedef struct {
37
-    uint32_t A[8];
38
-    uint32_t B[8];
39
-    uint32_t C[8];
40
-    uint32_t D[8];
41
-    uint32_t E[8];
42
-    uint32_t F[8];
43
-    uint32_t G[8];
44
-    uint32_t H[8];
45
-    uint32_t Nl, Nh;
46
-    uint8_t num;   /* 1 or 2 */
47
-    uint32_t data[8][16];
48
-} SHA256_MB_CTX;
49
-typedef struct {
50
-    uint8_t *ptr;
51
-    uint32_t blocks;
52
-} HASH_DESC;
53
-
8
+#ifdef NTRU_DETECT_SIMD
9
+uint32_t OPENSSL_ia32cap_P[] __attribute__((visibility("hidden"))) = {0, 0, 0, 0};
10
+#else
54 11
 #ifdef __AVX2__
55 12
 /* don’t detect SHA extensions for now, just report AVX/AVX2 */
56 13
 uint32_t OPENSSL_ia32cap_P[] __attribute__((visibility("hidden"))) = {0, 1<<28, 1<<5, 0};
57 14
 #else
58 15
 uint32_t OPENSSL_ia32cap_P[] __attribute__((visibility("hidden"))) = {0, 0, 0, 0};
59 16
 #endif
17
+#endif   /* NTRU_DETECT_SIMD */
60 18
 
61
-extern void sha1_multi_block(SHA1_MB_CTX *, HASH_DESC *, int num);
62
-
63
-extern void sha256_multi_block(SHA256_MB_CTX *, HASH_DESC *, int num);
64
-
65
-void SHA1_MB_Init(SHA1_MB_CTX *ctx) {
66
-    memset(ctx, 0, sizeof(*ctx));
67
-    __m128i a = _mm_set1_epi32(0x67452301);
68
-    __m128i b = _mm_set1_epi32(0xefcdab89);
69
-    __m128i c = _mm_set1_epi32(0x98badcfe);
70
-    __m128i d = _mm_set1_epi32(0x10325476);
71
-    __m128i e = _mm_set1_epi32(0xc3d2e1f0);
72
-    _mm_storeu_si128((__m128i*)&ctx->A, a);
73
-    _mm_storeu_si128((__m128i*)&ctx->B, b);
74
-    _mm_storeu_si128((__m128i*)&ctx->C, c);
75
-    _mm_storeu_si128((__m128i*)&ctx->D, d);
76
-    _mm_storeu_si128((__m128i*)&ctx->E, e);
77
-}
78
-
79
-void SHA1_MB_Init8(SHA1_MB_CTX *ctx) {
80
-    /* init A[0]..A[3], B[0]..B[3], ... */
81
-    SHA1_MB_Init(ctx);
82
-    /* init A[4]..A[7], B[4]..B[7], ... */
83
-    __m128i a = _mm_set1_epi32(0x67452301);
84
-    __m128i b = _mm_set1_epi32(0xefcdab89);
85
-    __m128i c = _mm_set1_epi32(0x98badcfe);
86
-    __m128i d = _mm_set1_epi32(0x10325476);
87
-    __m128i e = _mm_set1_epi32(0xc3d2e1f0);
88
-    _mm_storeu_si128((__m128i*)&ctx->A[4], a);
89
-    _mm_storeu_si128((__m128i*)&ctx->B[4], b);
90
-    _mm_storeu_si128((__m128i*)&ctx->C[4], c);
91
-    _mm_storeu_si128((__m128i*)&ctx->D[4], d);
92
-    _mm_storeu_si128((__m128i*)&ctx->E[4], e);
93
-}
94
-
95
-void SHA1_MB_Update(SHA1_MB_CTX *ctx, uint8_t *data[4], size_t len) {
96
-    if (len == 0)
97
-        return;
98
-
99
-    uint32_t l = (ctx->Nl + (((uint32_t) len) << 3)) & 0xffffffffUL;
100
-    /*
101
-     * 95-05-24 eay Fixed a bug with the overflow handling, thanks to Wei Dai
102
-     * <weidai@eskimo.com> for pointing it out.
103
-     */
104
-    if (l < ctx->Nl)              /* overflow */
105
-        ctx->Nh++;
106
-    ctx->Nh += (uint32_t) (len >> 29); /* might cause compiler warning on
107
-                                       * 16-bit */
108
-    ctx->Nl = l;
109
-
110
-    uint8_t *data_[4];
111
-    uint8_t i;
112
-    for (i=0; i<4; i++)
113
-        data_[i] = data[i];
114
-
115
-    size_t n = len / 64;
116
-    if (n > 0) {
117
-        HASH_DESC hdesc[4];
118
-        for (i=0; i<4; i++) {
119
-            hdesc[i].ptr = data[i];
120
-            hdesc[i].blocks = n;
121
-        }
122
-        sha1_multi_block(ctx, hdesc, 1);
123
-        n *= 64;
124
-        for (i=0; i<4; i++)
125
-            data_[i] += n;
126
-        len -= n;
127
-    }
128
-
129
-    if (len != 0) {
130
-        ctx->num = (uint32_t)len;
131
-        for (i=0; i<4; i++) {
132
-            uint8_t *d = (uint8_t*)ctx->data[i];
133
-            memcpy(d, data_[i], len);
134
-        }
135
-    }
136
-}
137
-
138
-void SHA1_MB_Update8(SHA1_MB_CTX *ctx, uint8_t *data[8], size_t len) {
139
-    if (len == 0)
140
-        return;
141
-
142
-    uint32_t l = (ctx->Nl + (((uint32_t) len) << 3)) & 0xffffffffUL;
143
-    /*
144
-     * 95-05-24 eay Fixed a bug with the overflow handling, thanks to Wei Dai
145
-     * <weidai@eskimo.com> for pointing it out.
146
-     */
147
-    if (l < ctx->Nl)              /* overflow */
148
-        ctx->Nh++;
149
-    ctx->Nh += (uint32_t) (len >> 29); /* might cause compiler warning on
150
-                                       * 16-bit */
151
-    ctx->Nl = l;
152
-
153
-    uint8_t *data_[8];
154
-    uint8_t i;
155
-    for (i=0; i<8; i++)
156
-        data_[i] = data[i];
157
-
158
-    size_t n = len / 64;
159
-    if (n > 0) {
160
-        HASH_DESC hdesc[8];
161
-        for (i=0; i<8; i++) {
162
-            hdesc[i].ptr = data[i];
163
-            hdesc[i].blocks = n;
164
-        }
165
-        sha1_multi_block(ctx, hdesc, 2);
166
-        n *= 64;
167
-        for (i=0; i<8; i++)
168
-            data_[i] += n;
169
-        len -= n;
170
-    }
171
-
172
-    if (len != 0) {
173
-        ctx->num = (uint32_t)len;
174
-        for (i=0; i<8; i++) {
175
-            uint8_t *d = (uint8_t*)ctx->data[i];
176
-            memcpy(d, data_[i], len);
177
-        }
178
-    }
179
-}
180
-
181
-void SHA1_MB_Final(uint8_t *digest[4], SHA1_MB_CTX *ctx) {
182
-    size_t n = ctx->num;
183
-    uint8_t i;
184
-    for (i=0; i<4; i++) {
185
-        uint8_t *d = (uint8_t*)ctx->data[i];
186
-        *(d+n) = 0x80;
187
-    }
188
-    n++;
189
-    for (i=0; i<4; i++)
190
-        memset(((uint8_t*)ctx->data[i]) + n, 0, 64 - n);
191
-
192
-    if (n > (64 - 8)) {
193
-        n = 0;
194
-        HASH_DESC hdesc[4];
195
-        for (i=0; i<4; i++) {
196
-            hdesc[i].ptr = (uint8_t*)ctx->data[i];
197
-            hdesc[i].blocks = 1;
198
-        }
199
-        sha1_multi_block(ctx, hdesc, 1);
200
-    }
201
-    for (i=0; i<4; i++) {
202
-        uint8_t *d = (uint8_t*)&ctx->data[i];
203
-        memset(d+n, 0, 64-8-n);
204
-
205
-        d += 64 - 8;
206
-        uint32_t *d32 = (uint32_t*)d;
207
-        *d32 = ntohl(ctx->Nh);
208
-
209
-        d += 4;
210
-        d32 = (uint32_t*)d;
211
-        *d32 = ntohl(ctx->Nl);
212
-    }
213
-    HASH_DESC hdesc[4];
214
-    for (i=0; i<4; i++) {
215
-        hdesc[i].ptr = (uint8_t*)ctx->data[i];
216
-        hdesc[i].blocks = 1;
217
-    }
218
-    sha1_multi_block(ctx, hdesc, 1);
219
-
220
-    for (i=0; i<4; i++) {
221
-        uint32_t *d32 = (uint32_t*)digest[i];
222
-        *(d32++) = ntohl(ctx->A[i]);
223
-        *(d32++) = ntohl(ctx->B[i]);
224
-        *(d32++) = ntohl(ctx->C[i]);
225
-        *(d32++) = ntohl(ctx->D[i]);
226
-        *d32 = ntohl(ctx->E[i]);
227
-    }
228
-}
229
-
230
-void SHA1_MB_Final8(uint8_t *digest[8], SHA1_MB_CTX *ctx) {
231
-    size_t n = ctx->num;
232
-    uint8_t i;
233
-    for (i=0; i<8; i++) {
234
-        uint8_t *d = (uint8_t*)ctx->data[i];
235
-        *(d+n) = 0x80;
236
-    }
237
-    n++;
238
-    for (i=0; i<8; i++)
239
-        memset(((uint8_t*)ctx->data[i]) + n, 0, 64 - n);
240
-
241
-    if (n > (64 - 8)) {
242
-        n = 0;
243
-        HASH_DESC hdesc[8];
244
-        for (i=0; i<8; i++) {
245
-            hdesc[i].ptr = (uint8_t*)ctx->data[i];
246
-            hdesc[i].blocks = 1;
247
-        }
248
-        sha1_multi_block(ctx, hdesc, 2);
249
-    }
250
-    for (i=0; i<8; i++) {
251
-        uint8_t *d = (uint8_t*)&ctx->data[i];
252
-        memset(d+n, 0, 64-8-n);
253
-
254
-        d += 64 - 8;
255
-        uint32_t *d32 = (uint32_t*)d;
256
-        *d32 = ntohl(ctx->Nh);
257
-
258
-        d += 4;
259
-        d32 = (uint32_t*)d;
260
-        *d32 = ntohl(ctx->Nl);
261
-    }
262
-    HASH_DESC hdesc[8];
263
-    for (i=0; i<8; i++) {
264
-        hdesc[i].ptr = (uint8_t*)ctx->data[i];
265
-        hdesc[i].blocks = 1;
266
-    }
267
-    sha1_multi_block(ctx, hdesc, 2);
268
-
269
-    for (i=0; i<8; i++) {
270
-        uint32_t *d32 = (uint32_t*)digest[i];
271
-        *(d32++) = ntohl(ctx->A[i]);
272
-        *(d32++) = ntohl(ctx->B[i]);
273
-        *(d32++) = ntohl(ctx->C[i]);
274
-        *(d32++) = ntohl(ctx->D[i]);
275
-        *d32 = ntohl(ctx->E[i]);
276
-    }
277
-}
278
-
279
-void ntru_sha1_4way(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
280
-    SHA1_MB_CTX ctx;
281
-    SHA1_MB_Init(&ctx);
282
-    SHA1_MB_Update(&ctx, input, input_len);
283
-    SHA1_MB_Final(digest, &ctx);
284
-}
285
-
286
-void ntru_sha1_8way(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
287
-    SHA1_MB_CTX ctx;
288
-    SHA1_MB_Init8(&ctx);
289
-    SHA1_MB_Update8(&ctx, input, input_len);
290
-    SHA1_MB_Final8(digest, &ctx);
291
-}
292
-
293
-void SHA256_MB_Init(SHA256_MB_CTX *ctx) {
294
-    memset(ctx, 0, sizeof(*ctx));
295
-    __m128i a = _mm_set1_epi32(0x6a09e667);
296
-    __m128i b = _mm_set1_epi32(0xbb67ae85);
297
-    __m128i c = _mm_set1_epi32(0x3c6ef372);
298
-    __m128i d = _mm_set1_epi32(0xa54ff53a);
299
-    __m128i e = _mm_set1_epi32(0x510e527f);
300
-    __m128i f = _mm_set1_epi32(0x9b05688c);
301
-    __m128i g = _mm_set1_epi32(0x1f83d9ab);
302
-    __m128i h = _mm_set1_epi32(0x5be0cd19);
303
-    _mm_storeu_si128((__m128i*)&ctx->A, a);
304
-    _mm_storeu_si128((__m128i*)&ctx->B, b);
305
-    _mm_storeu_si128((__m128i*)&ctx->C, c);
306
-    _mm_storeu_si128((__m128i*)&ctx->D, d);
307
-    _mm_storeu_si128((__m128i*)&ctx->E, e);
308
-    _mm_storeu_si128((__m128i*)&ctx->F, f);
309
-    _mm_storeu_si128((__m128i*)&ctx->G, g);
310
-    _mm_storeu_si128((__m128i*)&ctx->H, h);
311
-}
19
+void (*ntru_sha1_4way_ptr)(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]);
312 20
 
313
-void SHA256_MB_Init8(SHA256_MB_CTX *ctx) {
314
-    /* init A[0]..A[3], B[0]..B[3], ... */
315
-    SHA256_MB_Init(ctx);
316
-    /* init A[4]..A[7], B[4]..B[7], ... */
317
-    __m128i a = _mm_set1_epi32(0x6a09e667);
318
-    __m128i b = _mm_set1_epi32(0xbb67ae85);
319
-    __m128i c = _mm_set1_epi32(0x3c6ef372);
320
-    __m128i d = _mm_set1_epi32(0xa54ff53a);
321
-    __m128i e = _mm_set1_epi32(0x510e527f);
322
-    __m128i f = _mm_set1_epi32(0x9b05688c);
323
-    __m128i g = _mm_set1_epi32(0x1f83d9ab);
324
-    __m128i h = _mm_set1_epi32(0x5be0cd19);
325
-    _mm_storeu_si128((__m128i*)&ctx->A[4], a);
326
-    _mm_storeu_si128((__m128i*)&ctx->B[4], b);
327
-    _mm_storeu_si128((__m128i*)&ctx->C[4], c);
328
-    _mm_storeu_si128((__m128i*)&ctx->D[4], d);
329
-    _mm_storeu_si128((__m128i*)&ctx->E[4], e);
330
-    _mm_storeu_si128((__m128i*)&ctx->F[4], f);
331
-    _mm_storeu_si128((__m128i*)&ctx->G[4], g);
332
-    _mm_storeu_si128((__m128i*)&ctx->H[4], h);
333
-}
21
+void (*ntru_sha256_4way_ptr)(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]);
334 22
 
335
-void SHA256_MB_Update(SHA256_MB_CTX *ctx, uint8_t *data[4], size_t len) {
336
-    if (len == 0)
337
-        return;
23
+void (*ntru_sha1_8way_ptr)(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]);
338 24
 
339
-    uint32_t l = (ctx->Nl + (((uint32_t) len) << 3)) & 0xffffffffUL;
340
-    /*
341
-     * 95-05-24 eay Fixed a bug with the overflow handling, thanks to Wei Dai
342
-     * <weidai@eskimo.com> for pointing it out.
343
-     */
344
-    if (l < ctx->Nl)              /* overflow */
345
-        ctx->Nh++;
346
-    ctx->Nh += (uint32_t) (len >> 29); /* might cause compiler warning on
347
-                                       * 16-bit */
348
-    ctx->Nl = l;
25
+void (*ntru_sha256_8way_ptr)(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]);
349 26
 
350
-    uint8_t *data_[4];
351
-    uint8_t i;
352
-    for (i=0; i<4; i++)
353
-        data_[i] = data[i];
354
-
355
-    size_t n = len / 64;
356
-    if (n > 0) {
357
-        HASH_DESC hdesc[4];
358
-        for (i=0; i<4; i++) {
359
-            hdesc[i].ptr = data[i];
360
-            hdesc[i].blocks = n;
361
-        }
362
-        sha256_multi_block(ctx, hdesc, 1);
363
-        n *= 64;
364
-        for (i=0; i<4; i++)
365
-            data_[i] += n;
366
-        len -= n;
367
-    }
368
-
369
-    if (len != 0) {
370
-        ctx->num = (uint32_t)len;
371
-        for (i=0; i<4; i++) {
372
-            uint8_t *d = (uint8_t*)ctx->data[i];
373
-            memcpy(d, data_[i], len);
374
-        }
375
-    }
27
+inline void ntru_sha1_4way(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
28
+    ntru_sha1_4way_ptr(input, input_len, digest);
376 29
 }
377 30
 
378
-void SHA256_MB_Update8(SHA256_MB_CTX *ctx, uint8_t *data[8], size_t len) {
379
-    if (len == 0)
380
-        return;
381
-
382
-    uint32_t l = (ctx->Nl + (((uint32_t) len) << 3)) & 0xffffffffUL;
383
-    /*
384
-     * 95-05-24 eay Fixed a bug with the overflow handling, thanks to Wei Dai
385
-     * <weidai@eskimo.com> for pointing it out.
386
-     */
387
-    if (l < ctx->Nl)              /* overflow */
388
-        ctx->Nh++;
389
-    ctx->Nh += (uint32_t) (len >> 29); /* might cause compiler warning on
390
-                                       * 16-bit */
391
-    ctx->Nl = l;
392
-
393
-    uint8_t *data_[8];
394
-    uint8_t i;
395
-    for (i=0; i<8; i++)
396
-        data_[i] = data[i];
397
-
398
-    size_t n = len / 64;
399
-    if (n > 0) {
400
-        HASH_DESC hdesc[8];
401
-        for (i=0; i<8; i++) {
402
-            hdesc[i].ptr = data[i];
403
-            hdesc[i].blocks = n;
404
-        }
405
-        sha256_multi_block(ctx, hdesc, 2);
406
-        n *= 64;
407
-        for (i=0; i<8; i++)
408
-            data_[i] += n;
409
-        len -= n;
410
-    }
411
-
412
-    if (len != 0) {
413
-        ctx->num = (uint32_t)len;
414
-        for (i=0; i<8; i++) {
415
-            uint8_t *d = (uint8_t*)ctx->data[i];
416
-            memcpy(d, data_[i], len);
417
-        }
418
-    }
31
+void ntru_sha256_4way(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
32
+    ntru_sha256_4way_ptr(input, input_len, digest);
419 33
 }
420 34
 
421
-void SHA256_MB_Final(uint8_t *digest[4], SHA256_MB_CTX *ctx) {
422
-    size_t n = ctx->num;
423
-    uint8_t i;
424
-    for (i=0; i<4; i++) {
425
-        uint8_t *d = (uint8_t*)ctx->data[i];
426
-        *(d+n) = 0x80;
427
-    }
428
-    n++;
429
-    for (i=0; i<4; i++)
430
-        memset(((uint8_t*)ctx->data[i]) + n, 0, 64 - n);
431
-
432
-    if (n > (64 - 8)) {
433
-        n = 0;
434
-        HASH_DESC hdesc[4];
435
-        for (i=0; i<4; i++) {
436
-            hdesc[i].ptr = (uint8_t*)ctx->data[i];
437
-            hdesc[i].blocks = 1;
438
-        }
439
-        sha256_multi_block(ctx, hdesc, 1);
440
-    }
441
-    for (i=0; i<4; i++) {
442
-        uint8_t *d = (uint8_t*)&ctx->data[i];
443
-        memset(d+n, 0, 64-8-n);
444
-
445
-        d += 64 - 8;
446
-        uint32_t *d32 = (uint32_t*)d;
447
-        *d32 = ntohl(ctx->Nh);
448
-
449
-        d += 4;
450
-        d32 = (uint32_t*)d;
451
-        *d32 = ntohl(ctx->Nl);
452
-    }
453
-    HASH_DESC hdesc[4];
454
-    for (i=0; i<4; i++) {
455
-        hdesc[i].ptr = (uint8_t*)ctx->data[i];
456
-        hdesc[i].blocks = 1;
457
-    }
458
-    sha256_multi_block(ctx, hdesc, 1);
459
-
460
-    for (i=0; i<4; i++) {
461
-        uint32_t *d32 = (uint32_t*)digest[i];
462
-        *(d32++) = ntohl(ctx->A[i]);
463
-        *(d32++) = ntohl(ctx->B[i]);
464
-        *(d32++) = ntohl(ctx->C[i]);
465
-        *(d32++) = ntohl(ctx->D[i]);
466
-        *(d32++) = ntohl(ctx->E[i]);
467
-        *(d32++) = ntohl(ctx->F[i]);
468
-        *(d32++) = ntohl(ctx->G[i]);
469
-        *d32 = ntohl(ctx->H[i]);
470
-    }
35
+inline void ntru_sha1_8way(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
36
+    ntru_sha1_8way_ptr(input, input_len, digest);
471 37
 }
472 38
 
473
-void SHA256_MB_Final8(uint8_t *digest[8], SHA256_MB_CTX *ctx) {
474
-    size_t n = ctx->num;
475
-    uint8_t i;
476
-    for (i=0; i<8; i++) {
477
-        uint8_t *d = (uint8_t*)ctx->data[i];
478
-        *(d+n) = 0x80;
479
-    }
480
-    n++;
481
-    for (i=0; i<8; i++)
482
-        memset(((uint8_t*)ctx->data[i]) + n, 0, 64 - n);
483
-
484
-    if (n > (64 - 8)) {
485
-        n = 0;
486
-        HASH_DESC hdesc[8];
487
-        for (i=0; i<8; i++) {
488
-            hdesc[i].ptr = (uint8_t*)ctx->data[i];
489
-            hdesc[i].blocks = 1;
490
-        }
491
-        sha256_multi_block(ctx, hdesc, 2);
492
-    }
493
-    for (i=0; i<8; i++) {
494
-        uint8_t *d = (uint8_t*)&ctx->data[i];
495
-        memset(d+n, 0, 64-8-n);
496
-
497
-        d += 64 - 8;
498
-        uint32_t *d32 = (uint32_t*)d;
499
-        *d32 = ntohl(ctx->Nh);
500
-
501
-        d += 4;
502
-        d32 = (uint32_t*)d;
503
-        *d32 = ntohl(ctx->Nl);
504
-    }
505
-    HASH_DESC hdesc[8];
506
-    for (i=0; i<8; i++) {
507
-        hdesc[i].ptr = (uint8_t*)ctx->data[i];
508
-        hdesc[i].blocks = 1;
509
-    }
510
-    sha256_multi_block(ctx, hdesc, 2);
511
-
512
-    for (i=0; i<8; i++) {
513
-        uint32_t *d32 = (uint32_t*)digest[i];
514
-        *(d32++) = ntohl(ctx->A[i]);
515
-        *(d32++) = ntohl(ctx->B[i]);
516
-        *(d32++) = ntohl(ctx->C[i]);
517
-        *(d32++) = ntohl(ctx->D[i]);
518
-        *(d32++) = ntohl(ctx->E[i]);
519
-        *(d32++) = ntohl(ctx->F[i]);
520
-        *(d32++) = ntohl(ctx->G[i]);
521
-        *d32 = ntohl(ctx->H[i]);
522
-    }
39
+void ntru_sha256_8way(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
40
+    ntru_sha256_8way_ptr(input, input_len, digest);
523 41
 }
524 42
 
525
-void ntru_sha256_4way(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
526
-    SHA256_MB_CTX ctx;
527
-    SHA256_MB_Init(&ctx);
528
-    SHA256_MB_Update(&ctx, input, input_len);
529
-    SHA256_MB_Final(digest, &ctx);
43
+void ntru_sha1(uint8_t *input, uint16_t input_len, uint8_t *digest) {
44
+    sph_sha1_context context;
45
+    sph_sha1_init(&context);
46
+    sph_sha1(&context, input, input_len);
47
+    sph_sha1_close(&context, digest);
530 48
 }
531 49
 
532
-void ntru_sha256_8way(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
533
-    SHA256_MB_CTX ctx;
534
-    SHA256_MB_Init8(&ctx);
535
-    SHA256_MB_Update8(&ctx, input, input_len);
536
-    SHA256_MB_Final8(digest, &ctx);
50
+void ntru_sha256(uint8_t *input, uint16_t input_len, uint8_t *digest) {
51
+    sph_sha256_context context;
52
+    sph_sha256_init(&context);
53
+    sph_sha256(&context, input, input_len);
54
+    sph_sha256_close(&context, digest);
537 55
 }
538 56
 
539
-#else   /* non-SSE code */
540
-
541
-void ntru_sha1_4way(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
57
+void ntru_sha1_4way_nosimd(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
542 58
     uint8_t i;
543 59
     for (i=0; i<4; i++)
544 60
         ntru_sha1(input[i], input_len, digest[i]);
545 61
 }
546 62
 
547
-void ntru_sha1_8way(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
63
+void ntru_sha1_8way_nosimd(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
548 64
     uint8_t i;
549 65
     for (i=0; i<8; i++)
550 66
         ntru_sha1(input[i], input_len, digest[i]);
551 67
 }
552 68
 
553
-void ntru_sha256_4way(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
69
+void ntru_sha256_4way_nosimd(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
554 70
     uint8_t i;
555 71
     for (i=0; i<4; i++)
556 72
         ntru_sha256(input[i], input_len, digest[i]);
557 73
 }
558 74
 
559
-void ntru_sha256_8way(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
75
+void ntru_sha256_8way_nosimd(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
560 76
     uint8_t i;
561 77
     for (i=0; i<8; i++)
562 78
         ntru_sha256(input[i], input_len, digest[i]);
563 79
 }
564
-#endif   /* __SSSE3__ && _LP64 */
80
+
81
+void ntru_set_optimized_impl_hash() {
82
+#ifdef NTRU_DETECT_SIMD
83
+    if (__builtin_cpu_supports("ssse3") || __builtin_cpu_supports("avx2")) {
84
+        ntru_sha1_4way_ptr = ntru_sha1_4way_simd;
85
+        ntru_sha256_4way_ptr = ntru_sha256_4way_simd;
86
+        ntru_sha1_8way_ptr = ntru_sha1_8way_simd;
87
+        ntru_sha256_8way_ptr = ntru_sha256_8way_simd;
88
+        if (__builtin_cpu_supports("avx2")) {
89
+            OPENSSL_ia32cap_P[1] = 1<<28;
90
+            OPENSSL_ia32cap_P[2] = 1<<5;
91
+        }
92
+    }
93
+    else {
94
+        ntru_sha1_4way_ptr = ntru_sha1_4way_nosimd;
95
+        ntru_sha256_4way_ptr = ntru_sha256_4way_nosimd;
96
+        ntru_sha1_8way_ptr = ntru_sha1_8way_nosimd;
97
+        ntru_sha256_8way_ptr = ntru_sha256_8way_nosimd;
98
+    }
99
+#else
100
+#if defined __SSSE3__ || __AVX2__
101
+    ntru_sha1_4way_ptr = ntru_sha1_4way_simd;
102
+    ntru_sha256_4way_ptr = ntru_sha256_4way_simd;
103
+    ntru_sha1_8way_ptr = ntru_sha1_8way_simd;
104
+    ntru_sha256_8way_ptr = ntru_sha256_8way_simd;
105
+#ifdef __AVX2__
106
+    OPENSSL_ia32cap_P[1] = 1<<28;
107
+    OPENSSL_ia32cap_P[2] = 1<<5;
108
+#endif
109
+#else
110
+    ntru_sha1_4way_ptr = ntru_sha1_4way_nosimd;
111
+    ntru_sha256_4way_ptr = ntru_sha256_4way_nosimd;
112
+    ntru_sha1_8way_ptr = ntru_sha1_8way_nosimd;
113
+    ntru_sha256_8way_ptr = ntru_sha256_8way_nosimd;
114
+#endif   /*  __SSSE3__ || __AVX2__ */
115
+#endif   /* NTRU_DETECT_SIMD */
116
+}

src/hash.h (+8 -0)

@@ -15,4 +15,12 @@ void ntru_sha256_4way(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4])
15 15
 
16 16
 void ntru_sha256_8way(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]);
17 17
 
18
+/**
19
+ * @brief Choose fastest implementation
20
+ *
21
+ * Sets function pointers for SHA-* functions so the most efficient
22
+ * variant is used.
23
+ */
24
+void ntru_set_optimized_impl_hash();
25
+
18 26
 #endif   /* NTRU_HASH_H */
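
As a usage sketch for the new declaration above (hypothetical caller; in the library the call happens indirectly through ntru_set_optimized_impl() in src/ntru.c):

```c
#include <stdint.h>
#include "hash.h"

/* Hash four equal-length messages; each digests[i] must point to 32 writable bytes (SHA-256). */
void hash_four_messages(uint8_t *msgs[4], uint16_t msg_len, uint8_t *digests[4]) {
    ntru_set_optimized_impl_hash();   /* pick the SSSE3/AVX2 or scalar back end once */
    ntru_sha256_4way(msgs, msg_len, digests);
}
```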

src/hash_simd.c (+513 -0)

@@ -0,0 +1,513 @@
1
+#include <string.h>
2
+#include <stdint.h>
3
+#ifdef __SSSE3__
4
+#include <tmmintrin.h>
5
+#endif
6
+#ifdef WIN32
7
+#include <Winsock2.h>
8
+#else
9
+#include <netinet/in.h>
10
+#endif
11
+#include "hash_simd.h"
12
+
13
+typedef struct {
14
+    uint32_t A[8], B[8], C[8], D[8], E[8];
15
+    uint32_t Nl,Nh;
16
+    uint32_t data[8][16];
17
+    uint8_t num;   /* 1 or 2 */
18
+} SHA1_MB_CTX;
19
+typedef struct {
20
+    uint32_t A[8];
21
+    uint32_t B[8];
22
+    uint32_t C[8];
23
+    uint32_t D[8];
24
+    uint32_t E[8];
25
+    uint32_t F[8];
26
+    uint32_t G[8];
27
+    uint32_t H[8];
28
+    uint32_t Nl, Nh;
29
+    uint8_t num;   /* 1 or 2 */
30
+    uint32_t data[8][16];
31
+} SHA256_MB_CTX;
32
+typedef struct {
33
+    uint8_t *ptr;
34
+    uint32_t blocks;
35
+} HASH_DESC;
36
+
37
+extern void sha1_multi_block(SHA1_MB_CTX *, HASH_DESC *, int num);
38
+
39
+extern void sha256_multi_block(SHA256_MB_CTX *, HASH_DESC *, int num);
40
+
41
+void SHA1_MB_Init(SHA1_MB_CTX *ctx) {
42
+    memset(ctx, 0, sizeof(*ctx));
43
+    __m128i a = _mm_set1_epi32(0x67452301);
44
+    __m128i b = _mm_set1_epi32(0xefcdab89);
45
+    __m128i c = _mm_set1_epi32(0x98badcfe);
46
+    __m128i d = _mm_set1_epi32(0x10325476);
47
+    __m128i e = _mm_set1_epi32(0xc3d2e1f0);
48
+    _mm_storeu_si128((__m128i*)&ctx->A, a);
49
+    _mm_storeu_si128((__m128i*)&ctx->B, b);
50
+    _mm_storeu_si128((__m128i*)&ctx->C, c);
51
+    _mm_storeu_si128((__m128i*)&ctx->D, d);
52
+    _mm_storeu_si128((__m128i*)&ctx->E, e);
53
+}
54
+
55
+void SHA1_MB_Init8(SHA1_MB_CTX *ctx) {
56
+    /* init A[0]..A[3], B[0]..B[3], ... */
57
+    SHA1_MB_Init(ctx);
58
+    /* init A[4]..A[7], B[4]..B[7], ... */
59
+    __m128i a = _mm_set1_epi32(0x67452301);
60
+    __m128i b = _mm_set1_epi32(0xefcdab89);
61
+    __m128i c = _mm_set1_epi32(0x98badcfe);
62
+    __m128i d = _mm_set1_epi32(0x10325476);
63
+    __m128i e = _mm_set1_epi32(0xc3d2e1f0);
64
+    _mm_storeu_si128((__m128i*)&ctx->A[4], a);
65
+    _mm_storeu_si128((__m128i*)&ctx->B[4], b);
66
+    _mm_storeu_si128((__m128i*)&ctx->C[4], c);
67
+    _mm_storeu_si128((__m128i*)&ctx->D[4], d);
68
+    _mm_storeu_si128((__m128i*)&ctx->E[4], e);
69
+}
70
+
71
+void SHA1_MB_Update(SHA1_MB_CTX *ctx, uint8_t *data[4], size_t len) {
72
+    if (len == 0)
73
+        return;
74
+
75
+    uint32_t l = (ctx->Nl + (((uint32_t) len) << 3)) & 0xffffffffUL;
76
+    /*
77
+     * 95-05-24 eay Fixed a bug with the overflow handling, thanks to Wei Dai
78
+     * <weidai@eskimo.com> for pointing it out.
79
+     */
80
+    if (l < ctx->Nl)              /* overflow */
81
+        ctx->Nh++;
82
+    ctx->Nh += (uint32_t) (len >> 29); /* might cause compiler warning on
83
+                                       * 16-bit */
84
+    ctx->Nl = l;
85
+
86
+    uint8_t *data_[4];
87
+    uint8_t i;
88
+    for (i=0; i<4; i++)
89
+        data_[i] = data[i];
90
+
91
+    size_t n = len / 64;
92
+    if (n > 0) {
93
+        HASH_DESC hdesc[4];
94
+        for (i=0; i<4; i++) {
95
+            hdesc[i].ptr = data[i];
96
+            hdesc[i].blocks = n;
97
+        }
98
+        sha1_multi_block(ctx, hdesc, 1);
99
+        n *= 64;
100
+        for (i=0; i<4; i++)
101
+            data_[i] += n;
102
+        len -= n;
103
+    }
104
+
105
+    if (len != 0) {
106
+        ctx->num = (uint32_t)len;
107
+        for (i=0; i<4; i++) {
108
+            uint8_t *d = (uint8_t*)ctx->data[i];
109
+            memcpy(d, data_[i], len);
110
+        }
111
+    }
112
+}
113
+
114
+void SHA1_MB_Update8(SHA1_MB_CTX *ctx, uint8_t *data[8], size_t len) {
115
+    if (len == 0)
116
+        return;
117
+
118
+    uint32_t l = (ctx->Nl + (((uint32_t) len) << 3)) & 0xffffffffUL;
119
+    /*
120
+     * 95-05-24 eay Fixed a bug with the overflow handling, thanks to Wei Dai
121
+     * <weidai@eskimo.com> for pointing it out.
122
+     */
123
+    if (l < ctx->Nl)              /* overflow */
124
+        ctx->Nh++;
125
+    ctx->Nh += (uint32_t) (len >> 29); /* might cause compiler warning on
126
+                                       * 16-bit */
127
+    ctx->Nl = l;
128
+
129
+    uint8_t *data_[8];
130
+    uint8_t i;
131
+    for (i=0; i<8; i++)
132
+        data_[i] = data[i];
133
+
134
+    size_t n = len / 64;
135
+    if (n > 0) {
136
+        HASH_DESC hdesc[8];
137
+        for (i=0; i<8; i++) {
138
+            hdesc[i].ptr = data[i];
139
+            hdesc[i].blocks = n;
140
+        }
141
+        sha1_multi_block(ctx, hdesc, 2);
142
+        n *= 64;
143
+        for (i=0; i<8; i++)
144
+            data_[i] += n;
145
+        len -= n;
146
+    }
147
+
148
+    if (len != 0) {
149
+        ctx->num = (uint32_t)len;
150
+        for (i=0; i<8; i++) {
151
+            uint8_t *d = (uint8_t*)ctx->data[i];
152
+            memcpy(d, data_[i], len);
153
+        }
154
+    }
155
+}
156
+
157
+void SHA1_MB_Final(uint8_t *digest[4], SHA1_MB_CTX *ctx) {
158
+    size_t n = ctx->num;
159
+    uint8_t i;
160
+    for (i=0; i<4; i++) {
161
+        uint8_t *d = (uint8_t*)ctx->data[i];
162
+        *(d+n) = 0x80;
163
+    }
164
+    n++;
165
+    for (i=0; i<4; i++)
166
+        memset(((uint8_t*)ctx->data[i]) + n, 0, 64 - n);
167
+
168
+    if (n > (64 - 8)) {
169
+        n = 0;
170
+        HASH_DESC hdesc[4];
171
+        for (i=0; i<4; i++) {
172
+            hdesc[i].ptr = (uint8_t*)ctx->data[i];
173
+            hdesc[i].blocks = 1;
174
+        }
175
+        sha1_multi_block(ctx, hdesc, 1);
176
+    }
177
+    for (i=0; i<4; i++) {
178
+        uint8_t *d = (uint8_t*)&ctx->data[i];
179
+        memset(d+n, 0, 64-8-n);
180
+
181
+        d += 64 - 8;
182
+        uint32_t *d32 = (uint32_t*)d;
183
+        *d32 = ntohl(ctx->Nh);
184
+
185
+        d += 4;
186
+        d32 = (uint32_t*)d;
187
+        *d32 = ntohl(ctx->Nl);
188
+    }
189
+    HASH_DESC hdesc[4];
190
+    for (i=0; i<4; i++) {
191
+        hdesc[i].ptr = (uint8_t*)ctx->data[i];
192
+        hdesc[i].blocks = 1;
193
+    }
194
+    sha1_multi_block(ctx, hdesc, 1);
195
+
196
+    for (i=0; i<4; i++) {
197
+        uint32_t *d32 = (uint32_t*)digest[i];
198
+        *(d32++) = ntohl(ctx->A[i]);
199
+        *(d32++) = ntohl(ctx->B[i]);
200
+        *(d32++) = ntohl(ctx->C[i]);
201
+        *(d32++) = ntohl(ctx->D[i]);
202
+        *d32 = ntohl(ctx->E[i]);
203
+    }
204
+}
205
+
206
+void SHA1_MB_Final8(uint8_t *digest[8], SHA1_MB_CTX *ctx) {
207
+    size_t n = ctx->num;
208
+    uint8_t i;
209
+    for (i=0; i<8; i++) {
210
+        uint8_t *d = (uint8_t*)ctx->data[i];
211
+        *(d+n) = 0x80;
212
+    }
213
+    n++;
214
+    for (i=0; i<8; i++)
215
+        memset(((uint8_t*)ctx->data[i]) + n, 0, 64 - n);
216
+
217
+    if (n > (64 - 8)) {
218
+        n = 0;
219
+        HASH_DESC hdesc[8];
220
+        for (i=0; i<8; i++) {
221
+            hdesc[i].ptr = (uint8_t*)ctx->data[i];
222
+            hdesc[i].blocks = 1;
223
+        }
224
+        sha1_multi_block(ctx, hdesc, 2);
225
+    }
226
+    for (i=0; i<8; i++) {
227
+        uint8_t *d = (uint8_t*)&ctx->data[i];
228
+        memset(d+n, 0, 64-8-n);
229
+
230
+        d += 64 - 8;
231
+        uint32_t *d32 = (uint32_t*)d;
232
+        *d32 = ntohl(ctx->Nh);
233
+
234
+        d += 4;
235
+        d32 = (uint32_t*)d;
236
+        *d32 = ntohl(ctx->Nl);
237
+    }
238
+    HASH_DESC hdesc[8];
239
+    for (i=0; i<8; i++) {
240
+        hdesc[i].ptr = (uint8_t*)ctx->data[i];
241
+        hdesc[i].blocks = 1;
242
+    }
243
+    sha1_multi_block(ctx, hdesc, 2);
244
+
245
+    for (i=0; i<8; i++) {
246
+        uint32_t *d32 = (uint32_t*)digest[i];
247
+        *(d32++) = ntohl(ctx->A[i]);
248
+        *(d32++) = ntohl(ctx->B[i]);
249
+        *(d32++) = ntohl(ctx->C[i]);
250
+        *(d32++) = ntohl(ctx->D[i]);
251
+        *d32 = ntohl(ctx->E[i]);
252
+    }
253
+}
254
+
255
+void ntru_sha1_4way_simd(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
256
+    SHA1_MB_CTX ctx;
257
+    SHA1_MB_Init(&ctx);
258
+    SHA1_MB_Update(&ctx, input, input_len);
259
+    SHA1_MB_Final(digest, &ctx);
260
+}
261
+
262
+void ntru_sha1_8way_simd(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
263
+    SHA1_MB_CTX ctx;
264
+    SHA1_MB_Init8(&ctx);
265
+    SHA1_MB_Update8(&ctx, input, input_len);
266
+    SHA1_MB_Final8(digest, &ctx);
267
+}
268
+
269
+void SHA256_MB_Init(SHA256_MB_CTX *ctx) {
270
+    memset(ctx, 0, sizeof(*ctx));
271
+    __m128i a = _mm_set1_epi32(0x6a09e667);
272
+    __m128i b = _mm_set1_epi32(0xbb67ae85);
273
+    __m128i c = _mm_set1_epi32(0x3c6ef372);
274
+    __m128i d = _mm_set1_epi32(0xa54ff53a);
275
+    __m128i e = _mm_set1_epi32(0x510e527f);
276
+    __m128i f = _mm_set1_epi32(0x9b05688c);
277
+    __m128i g = _mm_set1_epi32(0x1f83d9ab);
278
+    __m128i h = _mm_set1_epi32(0x5be0cd19);
279
+    _mm_storeu_si128((__m128i*)&ctx->A, a);
280
+    _mm_storeu_si128((__m128i*)&ctx->B, b);
281
+    _mm_storeu_si128((__m128i*)&ctx->C, c);
282
+    _mm_storeu_si128((__m128i*)&ctx->D, d);
283
+    _mm_storeu_si128((__m128i*)&ctx->E, e);
284
+    _mm_storeu_si128((__m128i*)&ctx->F, f);
285
+    _mm_storeu_si128((__m128i*)&ctx->G, g);
286
+    _mm_storeu_si128((__m128i*)&ctx->H, h);
287
+}
288
+
289
+void SHA256_MB_Init8(SHA256_MB_CTX *ctx) {
290
+    /* init A[0]..A[3], B[0]..B[3], ... */
291
+    SHA256_MB_Init(ctx);
292
+    /* init A[4]..A[7], B[4]..B[7], ... */
293
+    __m128i a = _mm_set1_epi32(0x6a09e667);
294
+    __m128i b = _mm_set1_epi32(0xbb67ae85);
295
+    __m128i c = _mm_set1_epi32(0x3c6ef372);
296
+    __m128i d = _mm_set1_epi32(0xa54ff53a);
297
+    __m128i e = _mm_set1_epi32(0x510e527f);
298
+    __m128i f = _mm_set1_epi32(0x9b05688c);
299
+    __m128i g = _mm_set1_epi32(0x1f83d9ab);
300
+    __m128i h = _mm_set1_epi32(0x5be0cd19);
301
+    _mm_storeu_si128((__m128i*)&ctx->A[4], a);
302
+    _mm_storeu_si128((__m128i*)&ctx->B[4], b);
303
+    _mm_storeu_si128((__m128i*)&ctx->C[4], c);
304
+    _mm_storeu_si128((__m128i*)&ctx->D[4], d);
305
+    _mm_storeu_si128((__m128i*)&ctx->E[4], e);
306
+    _mm_storeu_si128((__m128i*)&ctx->F[4], f);
307
+    _mm_storeu_si128((__m128i*)&ctx->G[4], g);
308
+    _mm_storeu_si128((__m128i*)&ctx->H[4], h);
309
+}
310
+
311
+void SHA256_MB_Update(SHA256_MB_CTX *ctx, uint8_t *data[4], size_t len) {
312
+    if (len == 0)
313
+        return;
314
+
315
+    uint32_t l = (ctx->Nl + (((uint32_t) len) << 3)) & 0xffffffffUL;
316
+    /*
317
+     * 95-05-24 eay Fixed a bug with the overflow handling, thanks to Wei Dai
318
+     * <weidai@eskimo.com> for pointing it out.
319
+     */
320
+    if (l < ctx->Nl)              /* overflow */
321
+        ctx->Nh++;
322
+    ctx->Nh += (uint32_t) (len >> 29); /* might cause compiler warning on
323
+                                       * 16-bit */
324
+    ctx->Nl = l;
325
+
326
+    uint8_t *data_[4];
327
+    uint8_t i;
328
+    for (i=0; i<4; i++)
329
+        data_[i] = data[i];
330
+
331
+    size_t n = len / 64;
332
+    if (n > 0) {
333
+        HASH_DESC hdesc[4];
334
+        for (i=0; i<4; i++) {
335
+            hdesc[i].ptr = data[i];
336
+            hdesc[i].blocks = n;
337
+        }
338
+        sha256_multi_block(ctx, hdesc, 1);
339
+        n *= 64;
340
+        for (i=0; i<4; i++)
341
+            data_[i] += n;
342
+        len -= n;
343
+    }
344
+
345
+    if (len != 0) {
346
+        ctx->num = (uint32_t)len;
347
+        for (i=0; i<4; i++) {
348
+            uint8_t *d = (uint8_t*)ctx->data[i];
349
+            memcpy(d, data_[i], len);
350
+        }
351
+    }
352
+}
353
+
354
+void SHA256_MB_Update8(SHA256_MB_CTX *ctx, uint8_t *data[8], size_t len) {
355
+    if (len == 0)
356
+        return;
357
+
358
+    uint32_t l = (ctx->Nl + (((uint32_t) len) << 3)) & 0xffffffffUL;
359
+    /*
360
+     * 95-05-24 eay Fixed a bug with the overflow handling, thanks to Wei Dai
361
+     * <weidai@eskimo.com> for pointing it out.
362
+     */
363
+    if (l < ctx->Nl)              /* overflow */
364
+        ctx->Nh++;
365
+    ctx->Nh += (uint32_t) (len >> 29); /* might cause compiler warning on
366
+                                       * 16-bit */
367
+    ctx->Nl = l;
368
+
369
+    uint8_t *data_[8];
370
+    uint8_t i;
371
+    for (i=0; i<8; i++)
372
+        data_[i] = data[i];
373
+
374
+    size_t n = len / 64;
375
+    if (n > 0) {
376
+        HASH_DESC hdesc[8];
377
+        for (i=0; i<8; i++) {
378
+            hdesc[i].ptr = data[i];
379
+            hdesc[i].blocks = n;
380
+        }
381
+        sha256_multi_block(ctx, hdesc, 2);
382
+        n *= 64;
383
+        for (i=0; i<8; i++)
384
+            data_[i] += n;
385
+        len -= n;
386
+    }
387
+
388
+    if (len != 0) {
389
+        ctx->num = (uint32_t)len;
390
+        for (i=0; i<8; i++) {
391
+            uint8_t *d = (uint8_t*)ctx->data[i];
392
+            memcpy(d, data_[i], len);
393
+        }
394
+    }
395
+}
396
+
397
+void SHA256_MB_Final(uint8_t *digest[4], SHA256_MB_CTX *ctx) {
398
+    size_t n = ctx->num;
399
+    uint8_t i;
400
+    for (i=0; i<4; i++) {
401
+        uint8_t *d = (uint8_t*)ctx->data[i];
402
+        *(d+n) = 0x80;
403
+    }
404
+    n++;
405
+    for (i=0; i<4; i++)
406
+        memset(((uint8_t*)ctx->data[i]) + n, 0, 64 - n);
407
+
408
+    if (n > (64 - 8)) {
409
+        n = 0;
410
+        HASH_DESC hdesc[4];
411
+        for (i=0; i<4; i++) {
412
+            hdesc[i].ptr = (uint8_t*)ctx->data[i];
413
+            hdesc[i].blocks = 1;
414
+        }
415
+        sha256_multi_block(ctx, hdesc, 1);
416
+    }
417
+    for (i=0; i<4; i++) {
418
+        uint8_t *d = (uint8_t*)&ctx->data[i];
419
+        memset(d+n, 0, 64-8-n);
420
+
421
+        d += 64 - 8;
422
+        uint32_t *d32 = (uint32_t*)d;
423
+        *d32 = ntohl(ctx->Nh);
424
+
425
+        d += 4;
426
+        d32 = (uint32_t*)d;
427
+        *d32 = ntohl(ctx->Nl);
428
+    }
429
+    HASH_DESC hdesc[4];
430
+    for (i=0; i<4; i++) {
431
+        hdesc[i].ptr = (uint8_t*)ctx->data[i];
432
+        hdesc[i].blocks = 1;
433
+    }
434
+    sha256_multi_block(ctx, hdesc, 1);
435
+
436
+    for (i=0; i<4; i++) {
437
+        uint32_t *d32 = (uint32_t*)digest[i];
438
+        *(d32++) = ntohl(ctx->A[i]);
439
+        *(d32++) = ntohl(ctx->B[i]);
440
+        *(d32++) = ntohl(ctx->C[i]);
441
+        *(d32++) = ntohl(ctx->D[i]);
442
+        *(d32++) = ntohl(ctx->E[i]);
443
+        *(d32++) = ntohl(ctx->F[i]);
444
+        *(d32++) = ntohl(ctx->G[i]);
445
+        *d32 = ntohl(ctx->H[i]);
446
+    }
447
+}
448
+
449
+void SHA256_MB_Final8(uint8_t *digest[8], SHA256_MB_CTX *ctx) {
450
+    size_t n = ctx->num;
451
+    uint8_t i;
452
+    for (i=0; i<8; i++) {
453
+        uint8_t *d = (uint8_t*)ctx->data[i];
454
+        *(d+n) = 0x80;
455
+    }
456
+    n++;
457
+    for (i=0; i<8; i++)
458
+        memset(((uint8_t*)ctx->data[i]) + n, 0, 64 - n);
459
+
460
+    if (n > (64 - 8)) {
461
+        n = 0;
462
+        HASH_DESC hdesc[8];
463
+        for (i=0; i<8; i++) {
464
+            hdesc[i].ptr = (uint8_t*)ctx->data[i];
465
+            hdesc[i].blocks = 1;
466
+        }
467
+        sha256_multi_block(ctx, hdesc, 2);
468
+    }
469
+    for (i=0; i<8; i++) {
470
+        uint8_t *d = (uint8_t*)&ctx->data[i];
471
+        memset(d+n, 0, 64-8-n);
472
+
473
+        d += 64 - 8;
474
+        uint32_t *d32 = (uint32_t*)d;
475
+        *d32 = ntohl(ctx->Nh);
476
+
477
+        d += 4;
478
+        d32 = (uint32_t*)d;
479
+        *d32 = ntohl(ctx->Nl);
480
+    }
481
+    HASH_DESC hdesc[8];
482
+    for (i=0; i<8; i++) {
483
+        hdesc[i].ptr = (uint8_t*)ctx->data[i];
484
+        hdesc[i].blocks = 1;
485
+    }
486
+    sha256_multi_block(ctx, hdesc, 2);
487
+
488
+    for (i=0; i<8; i++) {
489
+        uint32_t *d32 = (uint32_t*)digest[i];
490
+        *(d32++) = ntohl(ctx->A[i]);
491
+        *(d32++) = ntohl(ctx->B[i]);
492
+        *(d32++) = ntohl(ctx->C[i]);
493
+        *(d32++) = ntohl(ctx->D[i]);
494
+        *(d32++) = ntohl(ctx->E[i]);
495
+        *(d32++) = ntohl(ctx->F[i]);
496
+        *(d32++) = ntohl(ctx->G[i]);
497
+        *d32 = ntohl(ctx->H[i]);
498
+    }
499
+}
500
+
501
+void ntru_sha256_4way_simd(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]) {
502
+    SHA256_MB_CTX ctx;
503
+    SHA256_MB_Init(&ctx);
504
+    SHA256_MB_Update(&ctx, input, input_len);
505
+    SHA256_MB_Final(digest, &ctx);
506
+}
507
+
508
+void ntru_sha256_8way_simd(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]) {
509
+    SHA256_MB_CTX ctx;
510
+    SHA256_MB_Init8(&ctx);
511
+    SHA256_MB_Update8(&ctx, input, input_len);
512
+    SHA256_MB_Final8(digest, &ctx);
513
+}

src/hash_simd.h (+14 -0)

@@ -0,0 +1,14 @@
1
+#ifndef NTRU_HASH_SIMD_H
2
+#define NTRU_HASH_SIMD_H
3
+
4
+#include <stdint.h>
5
+
6
+void ntru_sha1_4way_simd(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]);
7
+
8
+void ntru_sha1_8way_simd(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]);
9
+
10
+void ntru_sha256_4way_simd(uint8_t *input[4], uint16_t input_len, uint8_t *digest[4]);
11
+
12
+void ntru_sha256_8way_simd(uint8_t *input[8], uint16_t input_len, uint8_t *digest[8]);
13
+
14
+#endif   /* NTRU_HASH_SIMD_H */

src/ntru.c (+18 -0)

@@ -13,6 +13,14 @@
13 13
 const int8_t NTRU_COEFF1_TABLE[] = {0, 0, 0, 1, 1, 1, -1, -1};
14 14
 const int8_t NTRU_COEFF2_TABLE[] = {0, 1, -1, 0, 1, -1, 0, 1};
15 15
 
16
+void ntru_set_optimized_impl() {
17
+#ifdef NTRU_DETECT_SIMD
18
+    __builtin_cpu_init();
19
+#endif
20
+    ntru_set_optimized_impl_poly();
21
+    ntru_set_optimized_impl_hash();
22
+}
23
+
16 24
 /* Generates a random g. If NTRU_CHECK_INVERTIBILITY_G, g will be invertible mod q */
17 25
 uint8_t ntru_gen_g(const NtruEncParams *params, NtruPrivPoly *g, NtruRandContext *rand_ctx) {
18 26
     uint16_t N = params->N;
@@ -93,6 +101,8 @@ uint8_t ntru_gen_key_pair_single(const NtruEncParams *params, NtruEncPrivKey *pr
93 101
 }
94 102
 
95 103
 uint8_t ntru_gen_key_pair(const NtruEncParams *params, NtruEncKeyPair *kp, NtruRandContext *rand_ctx) {
104
+    ntru_set_optimized_impl();
105
+
96 106
     NtruIntPoly fq;
97 107
     uint8_t result = ntru_gen_key_pair_single(params, &kp->priv, &kp->pub, &fq, rand_ctx);
98 108
     ntru_clear_int(&fq);
@@ -100,6 +110,8 @@ uint8_t ntru_gen_key_pair(const NtruEncParams *params, NtruEncKeyPair *kp, NtruR
100 110
 }
101 111
 
102 112
 uint8_t ntru_gen_key_pair_multi(const NtruEncParams *params, NtruEncPrivKey *priv, NtruEncPubKey *pub, NtruRandContext *rand_ctx, uint32_t num_pub) {
113
+    ntru_set_optimized_impl();
114
+
103 115
     uint16_t q = params->q;
104 116
     NtruIntPoly fq;
105 117
     uint8_t result = ntru_gen_key_pair_single(params, priv, pub, &fq, rand_ctx);
@@ -123,6 +135,8 @@ uint8_t ntru_gen_key_pair_multi(const NtruEncParams *params, NtruEncPrivKey *pri
123 135
 }
124 136
 
125 137
 uint8_t ntru_gen_pub(const NtruEncParams *params, NtruEncPrivKey *priv, NtruEncPubKey *pub, NtruRandContext *rand_ctx) {
138
+    ntru_set_optimized_impl();
139
+
126 140
     uint16_t q = params->q;
127 141
     NtruIntPoly fq;
128 142
     if (!ntru_invert(&priv->t, q-1, &fq))
@@ -368,6 +382,8 @@ uint8_t ntru_check_rep_weight(NtruIntPoly *p, uint16_t dm0) {
368 382
 }
369 383
 
370 384
 uint8_t ntru_encrypt(uint8_t *msg, uint16_t msg_len, NtruEncPubKey *pub, const NtruEncParams *params, NtruRandContext *rand_ctx, uint8_t *enc) {
385
+    ntru_set_optimized_impl();
386
+
371 387
     uint16_t N = params->N;
372 388
     uint16_t q = params->q;
373 389
     uint16_t db = params->db;
@@ -437,6 +453,8 @@ void ntru_decrypt_poly(NtruIntPoly *e, NtruEncPrivKey *priv, uint16_t q, NtruInt
437 453
 }
438 454
 
439 455
 uint8_t ntru_decrypt(uint8_t *enc, NtruEncKeyPair *kp, const NtruEncParams *params, uint8_t *dec, uint16_t *dec_len) {
456
+    ntru_set_optimized_impl();
457
+
440 458
     uint16_t N = params->N;
441 459
     uint16_t q = params->q;
442 460
     uint16_t db = params->db;
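
ntru_set_optimized_impl() is called at the top of every public entry point (key generation, encryption, decryption), so library users never have to run the CPU detection themselves; repeating the call is harmless, since it only re-assigns a handful of function pointers. The detection relies on the GCC/Clang builtins __builtin_cpu_init() and __builtin_cpu_supports(), which exist on x86 targets, and is compiled in only when NTRU_DETECT_SIMD is defined; without it, the compile-time #ifdef selection (see poly.c below) stays in effect. A standalone sketch of the builtin-based detection, not taken from the patch:

    #include <stdio.h>

    /* Runtime x86 feature detection with GCC/Clang builtins (illustration only). */
    int main(void) {
        __builtin_cpu_init();                     /* populate the CPU feature data, as the patch does */
        if (__builtin_cpu_supports("avx2"))
            puts("dispatching to the AVX2 code paths");
        else if (__builtin_cpu_supports("ssse3"))
            puts("dispatching to the SSSE3 code paths");
        else
            puts("dispatching to the portable C code paths");
        return 0;
    }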

+ 81 - 711
src/poly.c

@@ -1,19 +1,14 @@
1 1
 #include <stdlib.h>
2 2
 #include <string.h>
3
-#ifdef __SSSE3__
4
-#include <tmmintrin.h>
5
-#endif
6
-#ifdef __AVX2__
7
-#include <immintrin.h>
8
-#endif
9 3
 #include "poly.h"
4
+#include "poly_ssse3.h"
5
+#include "poly_avx2.h"
10 6
 #include "rand.h"
11 7
 #include "err.h"
12 8
 #include "arith.h"
13 9
 #include "encparams.h"
14 10
 #include "ntru_endian.h"
15 11
 
16
-#define NTRU_SPARSE_THRESH 14
17 12
 #define NTRU_KARATSUBA_THRESH_16 40
18 13
 #define NTRU_KARATSUBA_THRESH_64 120
19 14
 
@@ -116,18 +111,6 @@ void ntru_neg_mod(NtruIntPoly *a, uint16_t modulus) {
116 111
         a->coeffs[i] = modulus - a->coeffs[i];
117 112
 }
118 113
 
119
-uint8_t ntru_mult_int(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
120
-#ifdef __AVX2__
121
-    return ntru_mult_int_avx2(a, b, c, mod_mask);
122
-#elif __SSSE3__
123
-    return ntru_mult_int_sse(a, b, c, mod_mask);
124
-#elif _LP64
125
-    return ntru_mult_int_64(a, b, c, mod_mask);
126
-#else
127
-    return ntru_mult_int_16(a, b, c, mod_mask);
128
-#endif
129
-}
130
-
131 114
 void ntru_mult_int_16_base(int16_t *a, int16_t *b, int16_t *c, uint16_t len, uint16_t N, uint16_t mod_mask) {
132 115
     memset(c, 0, 2*(2*len-1));   /* only needed if N < NTRU_KARATSUBA_THRESH_16 */
133 116
     uint16_t c_idx = 0;
@@ -339,144 +322,6 @@ uint8_t ntru_mult_int_64(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_
339 322
     return 1;
340 323
 }
341 324
 
342
-#ifdef __SSSE3__
343
-uint8_t ntru_mult_int_sse(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
344
-    uint16_t N = a->N;
345
-    if (N != b->N)
346
-        return 0;
347
-    c->N = N;
348
-    int16_t c_coeffs[2*NTRU_INT_POLY_SIZE];   /* double capacity for intermediate result */
349
-    memset(&c_coeffs, 0, sizeof(c_coeffs));
350
-
351
-    uint16_t k;
352
-    for (k=N; k<NTRU_INT_POLY_SIZE; k++) {
353
-        a->coeffs[k] = 0;
354
-        b->coeffs[k] = 0;
355
-    }
356
-    for (k=0; k<N; k+=8) {
357
-        uint8_t j;
358
-
359
-        /* process coeffs in 8x8 blocks */
360
-        __m128i b128[8];
361
-        for (j=0; j<8; j++)
362
-            b128[j] = _mm_set1_epi16(b->coeffs[k+j]);
363
-
364
-        /* indices 0..7 */
365
-        __m128i a128 = _mm_lddqu_si128((__m128i*)&a->coeffs[0]);
366
-        __m128i c128 = _mm_lddqu_si128((__m128i*)&c_coeffs[k]);
367
-        for (j=0; j<8; j++) {
368
-            __m128i product = _mm_mullo_epi16(a128, b128[j]);
369
-            c128 = _mm_add_epi16(c128, product);
370
-            a128 = _mm_slli_si128(a128, 2);
371
-        }
372
-        _mm_storeu_si128((__m128i*)&c_coeffs[k], c128);
373
-
374
-        /* indices 8... */
375
-        uint16_t i;
376
-        for (i=8; i<N+8; i+=8) {
377
-            __m128i c128 = _mm_lddqu_si128((__m128i*)&c_coeffs[k+i]);
378
-            __m128i a128_0 = _mm_lddqu_si128((__m128i*)&a->coeffs[i-7]);
379
-            __m128i a128_1 = _mm_lddqu_si128((__m128i*)&a->coeffs[i]);
380
-            for (j=0; j<8; j++) {
381
-                __m128i product = _mm_mullo_epi16(a128_1, b128[j]);
382
-                c128 = _mm_add_epi16(c128, product);
383
-
384
-                a128_0 = _mm_slli_si128(a128_0, 2);
385
-                a128_1 = _mm_alignr_epi8(a128_1, a128_0, 14);
386
-            }
387
-            _mm_storeu_si128((__m128i*)&c_coeffs[k+i], c128);
388
-        }
389
-    }
390
-    /* no need to SSE-ify the following loop b/c the compiler auto-vectorizes it */
391
-    for (k=0; k<N; k++)
392
-        c->coeffs[k] = c_coeffs[k] + c_coeffs[N+k];
393
-
394
-    ntru_mod_mask(c, mod_mask);
395
-    return 1;
396
-}
397
-#endif   /* __SSSE3__ */
398
-
399
-#ifdef __AVX2__
400
-uint8_t ntru_mult_int_avx2(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
401
-    uint16_t N = a->N;
402
-    if (N != b->N)
403
-        return 0;
404
-    c->N = N;
405
-    int16_t c_coeffs[2*NTRU_INT_POLY_SIZE];   /* double capacity for intermediate result */
406
-    memset(&c_coeffs, 0, sizeof(c_coeffs));
407
-
408
-    uint16_t k;
409
-    for (k=N; k<NTRU_INT_POLY_SIZE; k++) {
410
-        a->coeffs[k] = 0;
411
-        b->coeffs[k] = 0;
412
-    }
413
-    for (k=0; k<N; k+=16) {
414
-        uint8_t j;
415
-
416
-        __m256i b256[8];
417
-        for (j=0; j<8; j++) {
418
-
419
-            b256[j] = _mm256_inserti128_si256(_mm256_castsi128_si256(
420
-                    _mm_set1_epi16(b->coeffs[k+j])),
421
-                    _mm_set1_epi16(b->coeffs[k+8+j]),1);
422
-        }
423
-
424
-        /* indices 0..7 */
425
-        __m128i tmp_a = _mm_lddqu_si128((__m128i*)&a->coeffs[0]);
426
-        __m256i a256 = _mm256_broadcastsi128_si256(tmp_a);
427
-
428
-        __m256i c256 = _mm256_lddqu_si256((__m256i*)&c_coeffs[k]);
429
-        for (j=0; j<8; j++) {
430
-            __m256i product = _mm256_mullo_epi16(a256, b256[j]);
431
-            c256 = _mm256_add_epi16(c256, product);
432
-            a256 = _mm256_bslli_epi128(a256, 2);
433
-        }
434
-        _mm256_storeu_si256((__m256i*)&c_coeffs[k], c256);
435
-
436
-        /* indices 8... */
437
-        uint16_t i;
438
-        for (i=8; i<N+8; i+=8) {
439
-            __m256i c256 = _mm256_lddqu_si256((__m256i*)&c_coeffs[k+i]);
440
-
441
-            __m128i tmp_0 = _mm_lddqu_si128((__m128i*)&a->coeffs[i-7]);
442
-            __m256i a256_0 = _mm256_broadcastsi128_si256(tmp_0);
443
-
444
-            __m128i tmp_1 = _mm_lddqu_si128((__m128i*)&a->coeffs[i]);
445
-            __m256i a256_1 = _mm256_broadcastsi128_si256(tmp_1);
446
-
447
-
448
-            for (j=0; j<8; j++) {
449
-                __m256i product = _mm256_mullo_epi16(a256_1, b256[j]);
450
-                c256 = _mm256_add_epi16(c256, product);
451
-
452
-                a256_0 = _mm256_bslli_epi128(a256_0, 2);
453
-                a256_1 = _mm256_alignr_epi8(a256_1, a256_0, 14);
454
-            }
455
-            _mm256_storeu_si256((__m256i*)&c_coeffs[k+i], c256);
456
-        }
457
-    }
458
-
459
-    /* no need to SSE-ify the following loop b/c the compiler auto-vectorizes it */
460
-    for (k=0; k<N; k++)
461
-        c->coeffs[k] = c_coeffs[k] + c_coeffs[N+k];
462
-
463
-    ntru_mod_mask(c, mod_mask);
464
-    return 1;
465
-}
466
-#endif   /* __AVX2__ */
467
-
468
-uint8_t ntru_mult_tern(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
469
-#ifdef __AVX2__
470
-    return ntru_mult_tern_avx2(a, b, c, mod_mask);
471
-#elif __SSSE3__
472
-    return ntru_mult_tern_sse(a, b, c, mod_mask);
473
-#elif _LP64
474
-    return ntru_mult_tern_64(a, b, c, mod_mask);
475
-#else
476
-    return ntru_mult_tern_32(a, b, c, mod_mask);
477
-#endif
478
-}
479
-
480 325
 uint8_t ntru_mult_tern_32(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
481 326
     uint16_t N = a->N;
482 327
     if (N != b->N)
@@ -628,326 +473,6 @@ uint8_t ntru_mult_tern_64(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint1
628 473
     return 1;
629 474
 }
630 475
 
631
-#ifdef __SSSE3__
632
-/* Optimized for small df */
633
-uint8_t ntru_mult_tern_sse_sparse(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
634
-    uint16_t N = a->N;
635
-    if (N != b->N)
636
-        return 0;
637
-    memset(&c->coeffs, 0, N * sizeof c->coeffs[0]);
638
-    c->N = N;
639
-
640
-    /* add coefficients that are multiplied by 1 */
641
-    uint16_t i;
642
-    for (i=0; i<b->num_ones; i++) {
643
-        int16_t j;
644
-        int16_t k = b->ones[i];
645
-        uint16_t j_end = N<b->ones[i] ? 0 : N-b->ones[i];
646
-        /* it is safe not to truncate the last block of 8 coefficients */
647
-        /* because there is extra room at the end of the coeffs array  */
648
-        for (j=0; j<j_end; j+=8,k+=8) {
649
-            __m128i ck = _mm_lddqu_si128((__m128i*)&c->coeffs[k]);
650
-            __m128i aj = _mm_lddqu_si128((__m128i*)&a->coeffs[j]);
651
-            __m128i ca = _mm_add_epi16(ck, aj);
652
-            _mm_storeu_si128((__m128i*)&c->coeffs[k], ca);
653
-        }
654
-        j = j_end;
655
-        for (k=0; j<N-7; j+=8,k+=8) {
656
-            __m128i ck = _mm_lddqu_si128((__m128i*)&c->coeffs[k]);
657
-            __m128i aj = _mm_lddqu_si128((__m128i*)&a->coeffs[j]);
658
-            __m128i ca = _mm_add_epi16(ck, aj);
659
-            _mm_storeu_si128((__m128i*)&c->coeffs[k], ca);
660
-        }
661
-        for (; j<N; j++,k++)
662
-            c->coeffs[k] += a->coeffs[j];
663
-    }
664
-
665
-    /* subtract coefficients that are multiplied by -1 */
666
-    for (i=0; i<b->num_neg_ones; i++) {
667
-        int16_t j;
668
-        int16_t k = b->neg_ones[i];
669
-        uint16_t j_end = N<b->neg_ones[i] ? 0 : N-b->neg_ones[i];
670
-        /* it is safe not to truncate the last block of 8 coefficients */
671
-        /* because there is extra room at the end of the coeffs array  */
672
-        for (j=0; j<j_end; j+=8,k+=8) {
673
-            __m128i ck = _mm_lddqu_si128((__m128i*)&c->coeffs[k]);
674
-            __m128i aj = _mm_lddqu_si128((__m128i*)&a->coeffs[j]);
675
-            __m128i ca = _mm_sub_epi16(ck, aj);
676
-            _mm_storeu_si128((__m128i*)&c->coeffs[k], ca);
677
-        }
678
-        j = j_end;
679
-        for (k=0; j<N-7; j+=8,k+=8) {
680
-            __m128i ck = _mm_lddqu_si128((__m128i*)&c->coeffs[k]);
681
-            __m128i aj = _mm_lddqu_si128((__m128i*)&a->coeffs[j]);
682
-            __m128i ca = _mm_sub_epi16(ck, aj);
683
-            _mm_storeu_si128((__m128i*)&c->coeffs[k], ca);
684
-        }
685
-        for (; j<N; j++,k++)
686
-            c->coeffs[k] -= a->coeffs[j];
687
-    }
688
-
689
-    ntru_mod_mask(c, mod_mask);
690
-    return 1;
691
-}
692
-
693
-/* Optimized for large df */
694
-uint8_t ntru_mult_tern_sse_dense(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
695
-    uint16_t N = a->N;
696
-    if (N != b->N)
697
-        return 0;
698
-    c->N = N;
699
-
700
-    uint16_t i;
701
-    for (i=N; i<NTRU_INT_POLY_SIZE; i++)
702
-        a->coeffs[i] = 0;
703
-    int16_t c_coeffs_arr[8+2*NTRU_INT_POLY_SIZE];   /* double capacity for intermediate result + another 8 */
704
-    int16_t *c_coeffs = c_coeffs_arr + 8;
705
-    memset(&c_coeffs_arr, 0, sizeof(c_coeffs_arr));
706
-
707
-    __m128i a_coeffs0[8];
708
-    a_coeffs0[0] = _mm_lddqu_si128((__m128i*)&a->coeffs[0]);
709
-    for (i=1; i<8; i++)
710
-        a_coeffs0[i] = _mm_slli_si128(a_coeffs0[i-1], 2);
711
-
712
-    /* add coefficients that are multiplied by 1 */
713
-    for (i=0; i<b->num_ones; i++) {
714
-        int16_t k = b->ones[i];
715
-        /* process the first num_coeffs0 coefficients, 1<=num_coeffs0<=8 */
716
-        uint8_t num_bytes0 = 16 - (((size_t)&c_coeffs[k])%16);
717
-        uint8_t num_coeffs0 = num_bytes0 / 2;   /* c_coeffs[k+num_coeffs0] is 16-byte aligned */
718
-        k -= 8 - num_coeffs0;
719
-        __m128i *ck = (__m128i*)&c_coeffs[k];
720
-        __m128i aj = a_coeffs0[8-num_coeffs0];
721
-        __m128i ca = _mm_add_epi16(*ck, aj);
722
-        _mm_store_si128(ck, ca);
723
-        k += 8;
724
-        /* process the remaining coefficients in blocks of 8. */
725
-        /* it is safe not to truncate the last block of 8 coefficients */
726
-        /* because there is extra room at the end of the coeffs array  */
727
-        ck = (__m128i*)&c_coeffs[k];
728
-        int16_t j;
729
-        for (j=num_coeffs0; j<N; j+=8,k+=8) {
730
-            __m128i aj = _mm_lddqu_si128((__m128i*)&a->coeffs[j]);
731
-            __m128i ca = _mm_add_epi16(*ck, aj);
732
-            _mm_store_si128(ck, ca);
733
-            ck++;
734
-        }
735
-    }
736
-
737
-    /* subtract coefficients that are multiplied by -1 */
738
-    for (i=0; i<b->num_neg_ones; i++) {
739
-        int16_t k = b->neg_ones[i];
740
-        /* process the first num_coeffs0 coefficients, 1<=num_coeffs0<=8 */
741
-        uint8_t num_bytes0 = 16 - (((size_t)&c_coeffs[k])%16);
742
-        uint8_t num_coeffs0 = num_bytes0 / 2;   /* c_coeffs[k+num_coeffs0] is 16-byte aligned */
743
-        k -= 8 - num_coeffs0;
744
-        __m128i *ck = (__m128i*)&c_coeffs[k];
745
-        __m128i aj = a_coeffs0[8-num_coeffs0];
746
-        __m128i ca = _mm_sub_epi16(*ck, aj);
747
-        _mm_store_si128(ck, ca);
748
-        k += 8;
749
-        /* process the remaining coefficients in blocks of 8. */
750
-        /* it is safe not to truncate the last block of 8 coefficients */
751
-        /* because there is extra room at the end of the coeffs array  */
752
-        ck = (__m128i*)&c_coeffs[k];
753
-        int16_t j;
754
-        for (j=num_coeffs0; j<N; j+=8,k+=8) {
755
-            __m128i aj = _mm_lddqu_si128((__m128i*)&a->coeffs[j]);
756
-            __m128i ca = _mm_sub_epi16(*ck, aj);
757
-            _mm_store_si128(ck, ca);
758
-            ck++;
759
-        }
760
-    }
761
-
762
-    /* reduce c_coeffs[0..2N-1] to [0..N-1] and apply mod_mask to reduce values mod q */
763
-    /* handle the first coefficients individually if c_coeffs is not 16-byte aligned */
764
-    for (i=0; ((size_t)&c_coeffs[i])%16; i++)
765
-        c->coeffs[i] = (c_coeffs[i] + c_coeffs[N+i]) & mod_mask;
766
-    /* handle the remaining ones in blocks of 8 */
767
-    __m128i mod_mask_128 = _mm_set1_epi16(mod_mask);
768
-    __m128i *ci = (__m128i*)(&c_coeffs[i]);
769
-    for (; i<N; i+=8) {
770
-        __m128i c128_1 = _mm_lddqu_si128((__m128i*)&c_coeffs[i+N]);
771
-        __m128i c128_0 = _mm_add_epi16(*ci, c128_1);
772
-        c128_0 = _mm_and_si128(c128_0, mod_mask_128);
773
-        _mm_storeu_si128((__m128i*)&c->coeffs[i], c128_0);
774
-        ci++;
775
-    }
776
-
777
-    return 1;
778
-}
779
-
780
-uint8_t ntru_mult_tern_sse(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
781
-    if (b->num_ones<NTRU_SPARSE_THRESH && b->num_neg_ones<NTRU_SPARSE_THRESH)
782
-        return ntru_mult_tern_sse_sparse(a, b, c, mod_mask);
783
-    else
784
-        return ntru_mult_tern_sse_dense(a, b, c, mod_mask);
785
-}
786
-#endif   /* __SSSE3__ */
787
-
788
-#ifdef __AVX2__
789
-/* Optimized for small df */
790
-uint8_t ntru_mult_tern_avx2_sparse(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
791
-    uint16_t N = a->N;
792
-    if (N != b->N)
793
-        return 0;
794
-    memset(&c->coeffs, 0, N * sizeof c->coeffs[0]);
795
-    c->N = N;
796
-
797
-    /* add coefficients that are multiplied by 1 */
798
-    uint16_t i;
799
-    for (i=0; i<b->num_ones; i++) {
800
-        int16_t j;
801
-        int16_t k = b->ones[i];
802
-        uint16_t j_end = N<b->ones[i] ? 0 : N-b->ones[i];
803
-        /* it is safe not to truncate the last block of 8 coefficients */
804
-        /* because there is extra room at the end of the coeffs array  */
805
-        for (j=0; j<j_end; j+=16,k+=16) {
806
-            __m256i ck = _mm256_lddqu_si256((__m256i*)&c->coeffs[k]);
807
-            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
808
-            __m256i ca = _mm256_add_epi16(ck, aj);
809
-            _mm256_storeu_si256((__m256i*)&c->coeffs[k], ca);
810
-        }
811
-        j = j_end;
812
-        for (k=0; j<N-15; j+=16,k+=16) {
813
-            __m256i ck = _mm256_lddqu_si256((__m256i*)&c->coeffs[k]);
814
-            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
815
-            __m256i ca = _mm256_add_epi16(ck, aj);
816
-            _mm256_storeu_si256((__m256i*)&c->coeffs[k], ca);
817
-        }
818
-        for (; j<N; j++,k++)
819
-            c->coeffs[k] += a->coeffs[j];
820
-    }
821
-    /* subtract coefficients that are multiplied by -1 */
822
-    for (i=0; i<b->num_neg_ones; i++) {
823
-        int16_t j;
824
-        int16_t k = b->neg_ones[i];
825
-        uint16_t j_end = N<b->neg_ones[i] ? 0 : N-b->neg_ones[i];
826
-        /* it is safe not to truncate the last block of 8 coefficients */
827
-        /* because there is extra room at the end of the coeffs array  */
828
-        for (j=0; j<j_end; j+=16,k+=16) {
829
-            __m256i ck = _mm256_lddqu_si256((__m256i*)&c->coeffs[k]);
830
-            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
831
-            __m256i ca = _mm256_sub_epi16(ck, aj);
832
-            _mm256_storeu_si256((__m256i*)&c->coeffs[k], ca);
833
-        }
834
-        j = j_end;
835
-        for (k=0; j<N-15; j+=16,k+=16) {
836
-            __m256i ck = _mm256_lddqu_si256((__m256i*)&c->coeffs[k]);
837
-            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
838
-            __m256i ca = _mm256_sub_epi16(ck, aj);
839
-            _mm256_storeu_si256((__m256i*)&c->coeffs[k], ca);
840
-        }
841
-        for (; j<N; j++,k++)
842
-            c->coeffs[k] -= a->coeffs[j];
843
-    }
844
-
845
-    ntru_mod_mask(c, mod_mask);
846
-    return 1;
847
-}
848
-
849
-/* Optimized for large df */
850
-uint8_t ntru_mult_tern_avx2_dense(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
851
-    uint16_t N = a->N;
852
-    if (N != b->N)
853
-        return 0;
854
-    c->N = N;
855
-
856
-    uint16_t i;
857
-    for (i=N; i<NTRU_INT_POLY_SIZE; i++)
858
-        a->coeffs[i] = 0;
859
-    int16_t c_coeffs_arr[16+2*NTRU_INT_POLY_SIZE];   /* double capacity for intermediate result + another 8 */
860
-    int16_t *c_coeffs = c_coeffs_arr + 16;
861
-    memset(&c_coeffs_arr, 0, sizeof(c_coeffs_arr));
862
-
863
-    __m256i a_coeffs0[16];
864
-    a_coeffs0[0] = _mm256_lddqu_si256((__m256i*)&a->coeffs[0]);
865
-
866
-    for (i=1; i<16; i++) {
867
-        /* Emulate the SSE full-register shifting behaviour in AVX2 (the  */
868
-        /* corresponding _mm256_slli_si256 instruction shifts the two */
869
-        /* 128-bit lanes independently instead of the whole register). */
870
-        /* Two AVX2 instructions are needed for this. */
871
-        __m256i mask = _mm256_permute2x128_si256(a_coeffs0[i-1], a_coeffs0[i-1], _MM_SHUFFLE(0,0,2,0) );
872
-        a_coeffs0[i] = _mm256_alignr_epi8(a_coeffs0[i-1],mask,14);
873
-    }
874
-
875
-    /* add coefficients that are multiplied by 1 */
876
-    for (i=0; i<b->num_ones; i++) {
877
-        int16_t k = b->ones[i];
878
-        /* process the first num_coeffs0 coefficients, 1<=num_coeffs0<=8 */
879
-        uint8_t num_bytes0 = 32 - (((size_t)&c_coeffs[k])%32);
880
-        uint8_t num_coeffs0 = num_bytes0 / 2;   /* c_coeffs[k+num_coeffs0] is 32-byte aligned */
881
-        k -= 16 - num_coeffs0;
882
-        __m256i *ck = (__m256i*)&c_coeffs[k];
883
-        __m256i aj = a_coeffs0[16-num_coeffs0];
884
-        __m256i ca = _mm256_add_epi16(*ck, aj);
885
-        _mm256_store_si256(ck, ca);
886
-        k += 16;
887
-        /* process the remaining coefficients in blocks of 16. */
888
-        /* it is safe not to truncate the last block of 16 coefficients */
889
-        /* because there is extra room at the end of the coeffs array  */
890
-        ck = (__m256i*)&c_coeffs[k];
891
-        int16_t j;
892
-        for (j=num_coeffs0; j<N; j+=16,k+=16) {
893
-            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
894
-            __m256i ca = _mm256_add_epi16(*ck, aj);
895
-            _mm256_store_si256(ck, ca);
896
-            ck++;
897
-        }
898
-    }
899
-
900
-    /* subtract coefficients that are multiplied by -1 */
901
-    for (i=0; i<b->num_neg_ones; i++) {
902
-        int16_t k = b->neg_ones[i];
903
-        /* process the first num_coeffs0 coefficients, 1<=num_coeffs0<=8 */
904
-        uint8_t num_bytes0 = 32 - (((size_t)&c_coeffs[k])%32);
905
-        uint8_t num_coeffs0 = num_bytes0 / 2;   /* c_coeffs[k+num_coeffs0] is 32-byte aligned */
906
-        k -= 16 - num_coeffs0;
907
-        __m256i *ck = (__m256i*)&c_coeffs[k];
908
-        __m256i aj = a_coeffs0[16-num_coeffs0];
909
-        __m256i ca = _mm256_sub_epi16(*ck, aj);
910
-        _mm256_store_si256(ck, ca);
911
-        k += 16;
912
-        /* process the remaining coefficients in blocks of 16. */
913
-        /* it is safe not to truncate the last block of 16 coefficients */
914
-        /* because there is extra room at the end of the coeffs array  */
915
-        ck = (__m256i*)&c_coeffs[k];
916
-        int16_t j;
917
-        for (j=num_coeffs0; j<N; j+=16,k+=16) {
918
-            __m256i aj = _mm256_lddqu_si256((__m256i*)&a->coeffs[j]);
919
-            __m256i ca = _mm256_sub_epi16(*ck, aj);
920
-            _mm256_store_si256(ck, ca);
921
-            ck++;
922
-        }
923
-    }
924
-
925
-    /* reduce c_coeffs[0..2N-1] to [0..N-1] and apply mod_mask to reduce values mod q */
926
-    /* handle the first coefficients individually if c_coeffs is not 16-byte aligned */
927
-    for (i=0; ((size_t)&c_coeffs[i])%32; i++)
928
-        c->coeffs[i] = (c_coeffs[i] + c_coeffs[N+i]) & mod_mask;
929
-    /* handle the remaining ones in blocks of 16 */
930
-    __m256i mod_mask_256 = _mm256_set1_epi16(mod_mask);
931
-    __m256i *ci = (__m256i*)(&c_coeffs[i]);
932
-    for (; i<N; i+=16) {
933
-        __m256i c256_1 = _mm256_lddqu_si256((__m256i*)&c_coeffs[i+N]);
934
-        __m256i c256_0 = _mm256_add_epi16(*ci, c256_1);
935
-        c256_0 = _mm256_and_si256(c256_0, mod_mask_256);
936
-        _mm256_storeu_si256((__m256i*)&c->coeffs[i], c256_0);
937
-        ci++;
938
-    }
939
-
940
-    return 1;
941
-}
942
-
943
-uint8_t ntru_mult_tern_avx2(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
944
-    if (b->num_ones<NTRU_SPARSE_THRESH && b->num_neg_ones<NTRU_SPARSE_THRESH)
945
-        return ntru_mult_tern_avx2_sparse(a, b, c, mod_mask);
946
-    else
947
-        return ntru_mult_tern_avx2_dense(a, b, c, mod_mask);
948
-}
949
-#endif   /* __AVX2__ */
950
-
951 476
 #ifndef NTRU_AVOID_HAMMING_WT_PATENT
952 477
 uint8_t ntru_mult_prod(NtruIntPoly *a, NtruProdPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
953 478
     uint16_t N = a->N;
@@ -1207,88 +732,6 @@ void ntru_to_arr_32(NtruIntPoly *p, uint16_t q, uint8_t *a) {
1207 732
     }
1208 733
 }
1209 734
 
1210
-#ifdef __SSSE3__
1211
-void ntru_to_arr_sse_2048(NtruIntPoly *p, uint8_t *a) {
1212
-    /* mask{n} masks bits n..n+10 except for mask64 which masks bits 64..66 */
1213
-    __m128i mask0 = {(1<<11)-1, 0};
1214
-    __m128i mask11 = _mm_slli_epi64(mask0, 11);
1215
-    __m128i mask22 = _mm_slli_epi64(mask11, 11);
1216
-    __m128i mask33 = _mm_slli_epi64(mask22, 11);
1217
-    __m128i mask44 = _mm_slli_epi64(mask33, 11);
1218
-    __m128i mask55 = {(uint64_t)((1<<9)-1) << 55, 3};
1219
-    __m128i mask64 = {0, 3};
1220
-    __m128i mask66 = {0, ((1<<11)-1) << 2};
1221
-    __m128i mask77 = _mm_slli_epi64(mask66, 11);
1222
-    __m128i mask88 = _mm_slli_epi64(mask77, 11);
1223
-    __m128i mask99 = _mm_slli_epi64(mask88, 11);
1224
-
1225
-    uint16_t a_idx = 0;
1226
-    uint16_t p_idx;
1227
-    uint16_t N = p->N;
1228
-    for (p_idx=0; p_idx<N-10; p_idx+=8) {
1229
-        __m128i p128 = _mm_lddqu_si128((__m128i*)&p->coeffs[p_idx]);   /* 8 coeffs of p starting at p_idx */
1230
-        __m128i a128 = _mm_and_si128(p128, mask0);                                  /* bits [0..10]    -> [0..10]  */
1231
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 5), mask11));       /* [16..26]   -> [11..21] */
1232
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 10), mask22));      /* [32..42]   -> [22..32] */
1233
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 15), mask33));      /* [48..58]   -> [33..43] */
1234
-        __m128i p128_64 = _mm_srli_si128(p128, 8);
1235
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_slli_epi64(p128_64, 44), mask44));   /* [64..74]   -> [44..54] */
1236
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_slli_epi64(p128_64, 39), mask55));   /* [80..88]   -> [55..63] */
1237
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 25), mask64));      /* [89..90]   -> [64..65] */
1238
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 30), mask66));      /* [96..111]  -> [66..76] */
1239
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 35), mask77));      /* [112..127] -> [77..87] */
1240
-        _mm_storeu_si128((__m128i*)&a[a_idx], a128);
1241
-        a_idx += 11;
1242
-    }
1243
-
1244
-    /* remaining coeffs (up to 10) */
1245
-    __m128i p128 = _mm_lddqu_si128((__m128i*)&p->coeffs[p_idx]);   /* 8 coeffs of p starting at p_idx */
1246
-    __m128i a128 = _mm_setzero_si128();
1247
-    if (N-p_idx > 0)
1248
-        a128 = _mm_and_si128(p128, mask0);                                          /* bits [0..10]    -> [0..10]  */
1249
-    if (N-p_idx > 1)
1250
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 5), mask11));       /* [16..26]   -> [11..21] */
1251
-    if (N-p_idx > 2)
1252
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 10), mask22));      /* [32..42]   -> [22..32] */
1253
-    if (N-p_idx > 3)
1254
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 15), mask33));      /* [48..58]   -> [33..43] */
1255
-    __m128i p128_64 = _mm_srli_si128(p128, 8);
1256
-    if (N-p_idx > 4)
1257
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_slli_epi64(p128_64, 44), mask44));   /* [64..74]   -> [44..54] */
1258
-    if (N-p_idx > 5) {
1259
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_slli_epi64(p128_64, 39), mask55));   /* [80..88]   -> [55..63] */
1260
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 25), mask64));      /* [89..90]   -> [64..65] */
1261
-    }
1262
-    if (N-p_idx > 6)
1263
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 30), mask66));      /* [96..111]  -> [66..76] */
1264
-    if (N-p_idx > 7)
1265
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_srli_epi64(p128, 35), mask77));      /* [112..127] -> [77..87] */
1266
-    if (N-p_idx > 8) {
1267
-        p128 = _mm_lddqu_si128((__m128i*)&p->coeffs[p_idx+8]);           /* coeffs p_idx+8 through p_idx+15 */
1268
-        p128_64 = _mm_slli_si128(p128, 8);
1269
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_slli_epi64(p128_64, 24), mask88));  /* [0..15]    -> [88..98]  */
1270
-    }
1271
-    if (N-p_idx > 9)
1272
-        a128 = _mm_or_si128(a128, _mm_and_si128(_mm_slli_epi64(p128_64, 19), mask99));  /* [16..31]   -> [99..109] */
1273
-    uint8_t a_last[16];
1274
-    _mm_storeu_si128((__m128i*)a_last, a128);
1275
-    memcpy(&a[a_idx], a_last, ((N-p_idx)*11+7)/8);
1276
-}
1277
-#endif   /* __SSSE3__ */
1278
-
1279
-void ntru_to_arr(NtruIntPoly *p, uint16_t q, uint8_t *a) {
1280
-#ifdef __SSSE3__
1281
-    if (q == 2048)
1282
-        ntru_to_arr_sse_2048(p, a);
1283
-    else
1284
-        ntru_to_arr_32(p, q, a);
1285
-#elif _LP64
1286
-    ntru_to_arr_64(p, q, a);
1287
-#else
1288
-    ntru_to_arr_32(p, q, a);
1289
-#endif
1290
-}
1291
-
1292 735
 void ntru_to_arr4(NtruIntPoly *p, uint8_t *arr) {
1293 736
     uint16_t i = 0;
1294 737
     while (i < p->N-3) {
@@ -1358,32 +801,6 @@ void ntru_mult_fac(NtruIntPoly *a, int16_t factor) {
1358 801
         a->coeffs[i] *= factor;
1359 802
 }
1360 803
 
1361
-#ifdef __SSSE3__
1362
-void ntru_mod_sse(NtruIntPoly *p, uint16_t mod_mask) {
1363
-    uint16_t i;
1364
-    __m128i mod_mask_128 = _mm_set1_epi16(mod_mask);
1365
-
1366
-    for (i=0; i<p->N; i+=8) {
1367
-        __m128i a = _mm_lddqu_si128((__m128i*)&p->coeffs[i]);
1368
-        a = _mm_and_si128(a, mod_mask_128);
1369
-        _mm_storeu_si128((__m128i*)&p->coeffs[i], a);
1370
-    }
1371
-}
1372
-#endif
1373
-
1374
-#ifdef __AVX2__
1375
-void ntru_mod_avx2(NtruIntPoly *p, uint16_t mod_mask) {
1376
-    uint16_t i;
1377
-    __m256i mod_mask_256 = _mm256_set1_epi16(mod_mask);
1378
-
1379
-    for (i=0; i<p->N; i+=16) {
1380
-        __m256i a = _mm256_lddqu_si256((__m256i*)&p->coeffs[i]);
1381
-        a = _mm256_and_si256(a, mod_mask_256);
1382
-        _mm256_storeu_si256((__m256i*)&p->coeffs[i], a);
1383
-    }
1384
-}
1385
-#endif   /* __AVX2__ */
1386
-
1387 804
 void ntru_mod_64(NtruIntPoly *p, uint16_t mod_mask) {
1388 805
     typedef uint64_t __attribute__((__may_alias__)) uint64_t_alias;
1389 806
     uint64_t mod_mask_64 = mod_mask;
@@ -1394,25 +811,13 @@ void ntru_mod_64(NtruIntPoly *p, uint16_t mod_mask) {
1394 811
         *((uint64_t_alias*)&p->coeffs[i]) &= mod_mask_64;
1395 812
 }
1396 813
 
1397
-void ntru_mod_32(NtruIntPoly *p, uint16_t modulus) {
814
+void ntru_mod_32(NtruIntPoly *p, uint16_t mod_mask) {
1398 815
     typedef uint32_t __attribute__((__may_alias__)) uint32_t_alias;
1399
-    uint32_t mod_mask = modulus - 1;
1400
-    mod_mask += mod_mask << 16;
816
+    uint32_t mod_mask_32 = mod_mask;
817
+    mod_mask_32 += mod_mask_32 << 16;
1401 818
     uint16_t i;
1402 819
     for (i=0; i<p->N; i+=2)
1403
-        *((uint32_t_alias*)&p->coeffs[i]) &= mod_mask;
1404
-}
1405
-
1406
-void ntru_mod_mask(NtruIntPoly *p, uint16_t mod_mask) {
1407
-#ifdef __AVX2__
1408
-    ntru_mod_avx2(p, mod_mask);
1409
-#elif __SSSE3__
1410
-    ntru_mod_sse(p, mod_mask);
1411
-#elif _LP64
1412
-    ntru_mod_64(p, mod_mask);
1413
-#else
1414
-    ntru_mod_32(p, mod_mask+1);
1415
-#endif
820
+        *((uint32_t_alias*)&p->coeffs[i]) &= mod_mask_32;
1416 821
 }
1417 822
 
1418 823
 void ntru_mod3_standard(NtruIntPoly *p) {
@@ -1427,109 +832,15 @@ void ntru_mod3_standard(NtruIntPoly *p) {
1427 832
     }
1428 833
 }
1429 834
 
1430
-#ifdef __SSSE3__
1431
-/* (i%3)+3 for i=0..7 */
1432
-__m128i NTRU_MOD3_LUT = {0x0403050403050403, 0};
1433
-
1434
-/**
1435
- * SSE version of ntru_mod3.
1436
- * Based on Douglas W Jones' mod3 function at
1437
- * http://homepage.cs.uiowa.edu/~jones/bcd/mod.shtml.
1438
- */
1439
-void ntru_mod3_sse(NtruIntPoly *p) {
1440
-    uint16_t i;
1441
-    for (i=0; i<(p->N+7)/8*8; i+=8) {
1442
-        __m128i a = _mm_lddqu_si128((__m128i*)&p->coeffs[i]);
1443
-
1444
-        /* make positive */
1445
-        __m128i _3000 = _mm_set1_epi16(3000);
1446
-        a = _mm_add_epi16(a, _3000);
1447
-
1448
-        /* a = (a>>8) + (a&0xFF);  (sum base 2**8 digits) */
1449
-        __m128i a1 = _mm_srli_epi16(a, 8);
1450
-        __m128i mask = _mm_set1_epi16(0x00FF);
1451
-        __m128i a2 = _mm_and_si128(a, mask);
1452
-        a = _mm_add_epi16(a1, a2);
1453
-
1454
-        /* a = (a>>4) + (a&0xF);  (sum base 2**4 digits; worst case 0x3B) */
1455
-        a1 = _mm_srli_epi16(a, 4);
1456
-        mask = _mm_set1_epi16(0x000F);
1457
-        a2 = _mm_and_si128(a, mask);
1458
-        a = _mm_add_epi16(a1, a2);
1459
-        /* a = (a>>2) + (a&0x3);  (sum base 2**2 digits; worst case 0x1B) */
1460
-        a1 = _mm_srli_epi16(a, 2);
1461
-        mask = _mm_set1_epi16(0x0003);
1462
-        a2 = _mm_and_si128(a, mask);
1463
-        a = _mm_add_epi16(a1, a2);
1464
-
1465
-        /* a = (a>>2) + (a&0x3);  (sum base 2**2 digits; worst case 0x7) */
1466
-        a1 = _mm_srli_epi16(a, 2);
1467
-        mask = _mm_set1_epi16(0x0003);
1468
-        a2 = _mm_and_si128(a, mask);
1469
-        a = _mm_add_epi16(a1, a2);
1470
-
1471
-        __m128i a_mod3 = _mm_shuffle_epi8(NTRU_MOD3_LUT, a);
1472
-        /* _mm_shuffle_epi8 changed bytes 1, 3, 5, ... to non-zero; change them back to zero */
1473
-        mask = _mm_set1_epi16(0x00FF);
1474
-        a_mod3 = _mm_and_si128(a_mod3, mask);
1475
-        /* subtract 3 so coefficients are in the 0..2 range */
1476
-        __m128i three = _mm_set1_epi16(0x0003);
1477
-        a_mod3 = _mm_sub_epi16(a_mod3, three);
1478
-
1479
-        _mm_storeu_si128((__m128i*)&p->coeffs[i], a_mod3);
1480
-    }
1481
-}
1482
-#endif   /* __SSSE3__ */
1483
-
1484
-#ifdef __AVX2__
1485
-__m256i NTRU_MOD3_LUT_AVX = {0x0403050403050403, 0, 0x0403050403050403, 0};
1486
-
1487
-void ntru_mod3_avx2(NtruIntPoly *p) {
1488
-    uint16_t i;
1489
-    for (i=0; i<(p->N+15)/16*16; i+=16) {
1490
-        __m256i a = _mm256_lddqu_si256((__m256i*)&p->coeffs[i]);
1491
-
1492
-        /* make positive */
1493
-        __m256i _3000 = _mm256_set1_epi16(3000);
1494
-        a = _mm256_add_epi16(a, _3000);
1495
-
1496
-        /* a = (a>>8) + (a&0xFF);  (sum base 2**8 digits) */
1497
-        __m256i a1 = _mm256_srli_epi16(a, 8);
1498
-        __m256i mask = _mm256_set1_epi16(0x00FF);
1499
-        __m256i a2 = _mm256_and_si256(a, mask);
1500
-        a = _mm256_add_epi16(a1, a2);
1501
-
1502
-        /* a = (a>>4) + (a&0xF);  (sum base 2**4 digits; worst case 0x3B) */
1503
-        a1 = _mm256_srli_epi16(a, 4);
1504
-        mask = _mm256_set1_epi16(0x000F);
1505
-        a2 = _mm256_and_si256(a, mask);
1506
-        a = _mm256_add_epi16(a1, a2);
1507
-        /* a = (a>>2) + (a&0x3);  (sum base 2**2 digits; worst case 0x1B) */
1508
-        a1 = _mm256_srli_epi16(a, 2);
1509
-        mask = _mm256_set1_epi16(0x0003);
1510
-        a2 = _mm256_and_si256(a, mask);
1511
-        a = _mm256_add_epi16(a1, a2);
1512
-
1513
-        /* a = (a>>2) + (a&0x3);  (sum base 2**2 digits; worst case 0x7) */
1514
-        a1 = _mm256_srli_epi16(a, 2);
1515
-        mask = _mm256_set1_epi16(0x0003);
1516
-        a2 = _mm256_and_si256(a, mask);
1517
-        a = _mm256_add_epi16(a1, a2);
1518
-
1519
-        __m256i a_mod3 = _mm256_shuffle_epi8(NTRU_MOD3_LUT_AVX, a);
1520
-        /* _mm256_shuffle_epi8 changed bytes 1, 3, 5, ... to non-zero; change them back to zero */
1521
-        mask = _mm256_set1_epi16(0x00FF);
1522
-        a_mod3 = _mm256_and_si256(a_mod3, mask);
1523
-        /* subtract 3 so coefficients are in the 0..2 range */
1524
-        __m256i three = _mm256_set1_epi16(0x0003);
1525
-        a_mod3 = _mm256_sub_epi16(a_mod3, three);
1526
-
1527
-        _mm256_storeu_si256((__m256i*)&p->coeffs[i], a_mod3);
1528
-    }
1529
-}
1530
-#endif   /* __AVX2__ */
1531
-
1532 835
 void ntru_mod3(NtruIntPoly *p) {
836
+#ifdef NTRU_DETECT_SIMD
837
+    if (__builtin_cpu_supports("avx2"))
838
+        return ntru_mod3_avx2(p);
839
+    else if (__builtin_cpu_supports("ssse3"))
840
+        return ntru_mod3_sse(p);
841
+    else
842
+        return ntru_mod3_standard(p);
843
+#else
1533 844
 #ifdef __AVX2__
1534 845
     ntru_mod3_avx2(p);
1535 846
 #elif __SSSE3__
@@ -1537,6 +848,7 @@ void ntru_mod3(NtruIntPoly *p) {
1537 848
 #else
1538 849
     ntru_mod3_standard(p);
1539 850
 #endif   /* __SSSE3__ */
851
+#endif   /* NTRU_DETECT_SIMD */
1540 852
 }
1541 853
 
1542 854
 void ntru_mod_center(NtruIntPoly *p, uint16_t modulus) {
@@ -1646,14 +958,6 @@ void ntru_lift_inverse(NtruPrivPoly *a, NtruIntPoly *Fq, uint16_t q) {
1646 958
     }
1647 959
 }
1648 960
 
1649
-uint8_t ntru_invert(NtruPrivPoly *a, uint16_t mod_mask, NtruIntPoly *Fq) {
1650
-#ifdef _LP64
1651
-    return ntru_invert_64(a, mod_mask, Fq);
1652
-#else
1653
-    return ntru_invert_32(a, mod_mask, Fq);
1654
-#endif
1655
-}
1656
-
1657 961
 uint8_t ntru_invert_32(NtruPrivPoly *a, uint16_t mod_mask, NtruIntPoly *Fq) {
1658 962
     int16_t i;
1659 963
 #ifndef NTRU_AVOID_HAMMING_WT_PATENT
@@ -1880,3 +1184,69 @@ uint8_t ntru_invert_64(NtruPrivPoly *a, uint16_t mod_mask, NtruIntPoly *Fq) {
1880 1184
 
1881 1185
     return 1;
1882 1186
 }
1187
+
1188
+uint8_t (*ntru_invert)(NtruPrivPoly *a, uint16_t mod_mask, NtruIntPoly *Fq);
1189
+
1190
+void ntru_set_optimized_impl_poly() {
1191
+#ifdef NTRU_DETECT_SIMD
1192
+    if (__builtin_cpu_supports("avx2")) {
1193
+        ntru_mult_int = ntru_mult_int_avx2;
1194
+        ntru_mult_tern = ntru_mult_tern_avx2;
1195
+        ntru_to_arr = ntru_to_arr_sse;
1196
+        ntru_mod_mask = ntru_mod_avx2;
1197
+    }
1198
+    else if (__builtin_cpu_supports("ssse3")) {
1199
+        ntru_mult_int = ntru_mult_int_sse;
1200
+        ntru_mult_tern = ntru_mult_tern_sse;
1201
+        ntru_to_arr = ntru_to_arr_sse;
1202
+        ntru_mod_mask = ntru_mod_sse;
1203
+    }
1204
+    else if (sizeof(void*) >= 8) {   /* 64-bit arch */
1205
+        ntru_mult_int = ntru_mult_int_64;
1206
+        ntru_mult_tern = ntru_mult_tern_64;
1207
+        ntru_to_arr = ntru_to_arr_64;
1208
+        ntru_mod_mask = ntru_mod_64;
1209
+    }
1210
+    else {
1211
+        ntru_mult_int = ntru_mult_int_16;
1212
+        ntru_mult_tern = ntru_mult_tern_32;
1213
+        ntru_to_arr = ntru_to_arr_32;
1214
+        ntru_mod_mask = ntru_mod_32;
1215
+    }
1216
+
1217
+    if (sizeof(void*) >= 8)   /* 64-bit arch */
1218
+        ntru_invert = ntru_invert_64;
1219
+    else
1220
+        ntru_invert = ntru_invert_32;
1221
+
1222
+#else
1223
+
1224
+#ifdef __AVX2__
1225
+    ntru_mult_int = ntru_mult_int_avx2;
1226
+    ntru_mult_tern = ntru_mult_tern_avx2;
1227
+    ntru_to_arr = ntru_to_arr_sse;
1228
+    ntru_mod_mask = ntru_mod_avx2;
1229
+#elif __SSSE3__
1230
+    ntru_mult_int = ntru_mult_int_sse;
1231
+    ntru_mult_tern = ntru_mult_tern_sse;
1232
+    ntru_to_arr = ntru_to_arr_sse;
1233
+    ntru_mod_mask = ntru_mod_sse;
1234
+#elif _LP64
1235
+    ntru_mult_int = ntru_mult_int_64;
1236
+    ntru_mult_tern = ntru_mult_tern_64;
1237
+    ntru_to_arr = ntru_to_arr_64;
1238
+    ntru_mod_mask = ntru_mod_64;
1239
+#else
1240
+    ntru_mult_int = ntru_mult_int_16;
1241
+    ntru_mult_tern = ntru_mult_tern_32;
1242
+    ntru_to_arr = ntru_to_arr_32;
1243
+    ntru_mod_mask = ntru_mod_32;
1244
+#endif
1245
+
1246
+#if _LP64
1247
+    ntru_invert = ntru_invert_64;
1248
+#else
1249
+    ntru_invert = ntru_invert_32;
1250
+#endif
1251
+#endif   /* NTRU_DETECT_SIMD */
1252
+}
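
This is the heart of the poly.c change: ntru_mult_int, ntru_mult_tern, ntru_to_arr, ntru_mod_mask and ntru_invert are now function pointers, and ntru_set_optimized_impl_poly() binds them once, either from __builtin_cpu_supports() results (when NTRU_DETECT_SIMD is defined) or from the old compile-time #ifdefs. A reduced, self-contained sketch of that dispatch pattern follows; every name in it is invented and it is not libntru code:

    #include <stdint.h>
    #include <stddef.h>

    /* Portable fallback: mask each 16-bit coefficient individually. */
    static void mask_coeffs_portable(uint16_t *c, size_t n, uint16_t mod_mask) {
        size_t i;
        for (i = 0; i < n; i++)
            c[i] &= mod_mask;
    }

    /* 64-bit variant in the spirit of ntru_mod_64(): four coefficients per word.
     * Assumes n is a multiple of 4; the real code can rely on its over-allocated
     * coeffs[NTRU_INT_POLY_SIZE] arrays for the tail. */
    static void mask_coeffs_64(uint16_t *c, size_t n, uint16_t mod_mask) {
        typedef uint64_t __attribute__((__may_alias__)) u64_alias;
        uint64_t mask64 = mod_mask;
        mask64 |= mask64 << 16;
        mask64 |= mask64 << 32;
        size_t i;
        for (i = 0; i < n; i += 4)
            *((u64_alias*)&c[i]) &= mask64;
    }

    /* Callers always go through the pointer, just as callers of ntru_mod_mask now do. */
    void (*mask_coeffs)(uint16_t *c, size_t n, uint16_t mod_mask) = mask_coeffs_portable;

    void toy_set_optimized_impl(void) {
        if (sizeof(void*) >= 8)    /* the same 64-bit test the patch uses */
            mask_coeffs = mask_coeffs_64;
        else
            mask_coeffs = mask_coeffs_portable;
    }

ntru_mod3() above takes the other route: it stays a normal function and queries __builtin_cpu_supports() on every call, presumably to avoid exposing yet another function pointer at the cost of a cheap per-call check.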

+ 13 - 78
src/poly.h

@@ -71,7 +71,7 @@ void ntru_sub(NtruIntPoly *a, NtruIntPoly *b);
71 71
  * @param mod_mask an AND mask to apply; must be a power of two minus one
72 72
  * @return 0 if the number of coefficients differ, 1 otherwise
73 73
  */
74
-uint8_t ntru_mult_tern(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask);
74
+uint8_t (*ntru_mult_tern)(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask);
75 75
 
76 76
 /**
77 77
  * @brief General polynomial by ternary polynomial multiplication, 32 bit version
@@ -103,36 +103,6 @@ uint8_t ntru_mult_tern_32(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint1
103 103
  */
104 104
 uint8_t ntru_mult_tern_64(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask);
105 105
 
106
-/**
107
- * @brief General polynomial by ternary polynomial multiplication, SSSE3 version
108
- *
109
- * Multiplies a NtruIntPoly by a NtruTernPoly. The number of coefficients
110
- * must be the same for both polynomials.
111
- * This variant requires SSSE3 support.
112
- *
113
- * @param a a general polynomial
114
- * @param b a ternary polynomial
115
- * @param c output parameter; a pointer to store the new polynomial
116
- * @param mod_mask an AND mask to apply; must be a power of two minus one
117
- * @return 0 if the number of coefficients differ, 1 otherwise
118
- */
119
-uint8_t ntru_mult_tern_sse(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask);
120
-
121
-/**
122
- * @brief General polynomial by ternary polynomial multiplication, AVX2 version
123
- *
124
- * Multiplies a NtruIntPoly by a NtruTernPoly. The number of coefficients
125
- * must be the same for both polynomials.
126
- * This variant requires AVX2 support.
127
- *
128
- * @param a a general polynomial
129
- * @param b a ternary polynomial
130
- * @param c output parameter; a pointer to store the new polynomial
131
- * @param mod_mask an AND mask to apply; must be a power of two minus one
132
- * @return 0 if the number of coefficients differ, 1 otherwise
133
- */
134
-uint8_t ntru_mult_tern_avx2(NtruIntPoly *a, NtruTernPoly *b, NtruIntPoly *c, uint16_t mod_mask);
135
-
136 106
 #ifndef NTRU_AVOID_HAMMING_WT_PATENT
137 107
 /**
138 108
  * @brief General polynomial by product-form polynomial multiplication
@@ -195,19 +165,6 @@ void ntru_to_arr_64(NtruIntPoly *p, uint16_t q, uint8_t *a);
195 165
 /**
196 166
  * @brief Polynomial to binary
197 167
  *
198
- * Converts a NtruIntPoly to a uint8_t array. q is assumed to be 2048, so
199
- * each coefficient is encoded in 11 bits.
200
- * Requires SSSE3 support.
201
- *
202
- * @param p a polynomial
203
- * @param a output parameter; a pointer to store the encoded polynomial.
204
- *          Must accommodate at least 7 more bytes than the result takes up.
205
- */
206
-void ntru_to_arr_sse_2048(NtruIntPoly *p, uint8_t *a);
207
-
208
-/**
209
- * @brief Polynomial to binary
210
- *
211 168
  * Converts a NtruIntPoly to a uint8_t array. Each coefficient is encoded
212 169
  * in (log q) bits.
213 170
  *
@@ -215,7 +172,7 @@ void ntru_to_arr_sse_2048(NtruIntPoly *p, uint8_t *a);
215 172
  * @param q the modulus; must be a power of two
216 173
  * @param a output parameter; a pointer to store the encoded polynomial
217 174
  */
218
-void ntru_to_arr(NtruIntPoly *p, uint16_t q, uint8_t *a);
175
+void (*ntru_to_arr)(NtruIntPoly *p, uint16_t q, uint8_t *a);
219 176
 
220 177
 /**
221 178
  * @brief Polynomial to binary modulo 4
@@ -252,7 +209,7 @@ void ntru_mult_fac(NtruIntPoly *a, int16_t factor);
252 209
  * @param mod_mask an AND mask to apply to the coefficients of c
253 210
  * @return 0 if the number of coefficients differ, 1 otherwise
254 211
  */
255
-uint8_t ntru_mult_int(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask);
212
+uint8_t (*ntru_mult_int)(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask);
256 213
 
257 214
 /**
258 215
  * @brief Multiplication of two general polynomials with a modulus
@@ -285,36 +242,6 @@ uint8_t ntru_mult_int_16(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_
285 242
 uint8_t ntru_mult_int_64(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask);
286 243
 
287 244
 /**
288
- * @brief Multiplication of two general polynomials with a modulus, SSSE3 version
289
- *
290
- * Multiplies a NtruIntPoly by another, taking the coefficient values modulo an integer.
291
- * The number of coefficients must be the same for both polynomials.
292
- * Requires SSSE3 support.
293
- *
294
- * @param a input and output parameter; coefficients are overwritten
295
- * @param b a polynomial to multiply by
296
- * @param c output parameter; a pointer to store the new polynomial
297
- * @param mod_mask an AND mask to apply to the coefficients of c
298
- * @return 0 if the number of coefficients differ, 1 otherwise
299
- */
300
-uint8_t ntru_mult_int_sse(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask);
301
-
302
-/**
303
- * @brief Multiplication of two general polynomials with a modulus, AVX2 version
304
- *
305
- * Multiplies a NtruIntPoly by another, taking the coefficient values modulo an integer.
306
- * The number of coefficients must be the same for both polynomials.
307
- * Requires AVX2 support.
308
- *
309
- * @param a input and output parameter; coefficients are overwritten
310
- * @param b a polynomial to multiply by
311
- * @param c output parameter; a pointer to store the new polynomial
312
- * @param mod_mask an AND mask to apply to the coefficients of c
313
- * @return 0 if the number of coefficients differ, 1 otherwise
314
- */
315
-uint8_t ntru_mult_int_avx2(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask);
316
-
317
-/**
318 245
  * @brief Reduction modulo a power of two
319 246
  *
320 247
  * Reduces the coefficients of an NtruIntPoly modulo a power of two.
@@ -322,7 +249,7 @@ uint8_t ntru_mult_int_avx2(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint1
322 249
  * @param p input and output parameter; coefficients are overwritten
323 250
  * @param mod_mask an AND mask to apply to the coefficients of c
324 251
  */
325
-void ntru_mod_mask(NtruIntPoly *p, uint16_t mod_mask);
252
+void (*ntru_mod_mask)(NtruIntPoly *p, uint16_t mod_mask);
326 253
 
327 254
 /**
328 255
  * @brief Reduction modulo 3
@@ -398,7 +325,7 @@ void ntru_clear_int(NtruIntPoly *p);
398 325
  * @param Fq output parameter; a pointer to store the new polynomial
399 326
  * @return 1 if a is invertible, 0 otherwise
400 327
  */
401
-uint8_t ntru_invert(NtruPrivPoly *a, uint16_t mod_mask, NtruIntPoly *Fq);
328
+uint8_t (*ntru_invert)(NtruPrivPoly *a, uint16_t mod_mask, NtruIntPoly *Fq);
402 329
 
403 330
 /**
404 331
  * @brief Inverse modulo q
@@ -432,4 +359,12 @@ uint8_t ntru_invert_32(NtruPrivPoly *a, uint16_t mod_mask, NtruIntPoly *Fq);
432 359
  */
433 360
 uint8_t ntru_invert_64(NtruPrivPoly *a, uint16_t mod_mask, NtruIntPoly *Fq);
434 361
 
362
+/**
363
+ * @brief Choose fastest implementation
364
+ *
365
+ * Sets function pointers for polynomial math, etc. so the most efficient
366
+ * variant is used.
367
+ */
368
+void ntru_set_optimized_impl_poly();
369
+
435 370
 #endif   /* NTRU_POLY_H */
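
For code that includes poly.h the change is source-compatible: a call through a function pointer looks exactly like a call to a function, so existing call sites compile unchanged. The one new requirement is initialization order; as declared here the pointers are zero until ntru_set_optimized_impl_poly() runs, which the public ntru_* entry points in ntru.c trigger automatically. A hypothetical direct caller (sketch only; ntru.h is assumed to declare ntru_set_optimized_impl()):

    #include "poly.h"
    #include "ntru.h"   /* assumed to declare ntru_set_optimized_impl() */

    /* c = a*b with coefficients reduced mod q = 2048, i.e. mod_mask = q-1. */
    static uint8_t toy_mult(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c) {
        ntru_set_optimized_impl();          /* bind ntru_mult_int and friends first */
        return ntru_mult_int(a, b, c, 2047);
    }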

+ 293 - 0
src/poly_avx2.c

@@ -0,0 +1,293 @@
1
+#ifdef __AVX2__
2
+#include <string.h>
3
+#include <immintrin.h>
4
+#include "poly_avx2.h"
5
+#include "types.h"
6
+
7
+#define NTRU_SPARSE_THRESH_AVX2 14
8
+
9
+uint8_t ntru_mult_int_avx2(NtruIntPoly *a, NtruIntPoly *b, NtruIntPoly *c, uint16_t mod_mask) {
10
+    uint16_t N = a->N;
11
+    if (N != b->N)
12
+        return 0;
13
+    c->N = N;
14
+    int16_t c_coeffs[2*NTRU_INT_POLY_SIZE];   /* double capacity for intermediate result */
15
+    memset(&c_coeffs, 0, sizeof(c_coeffs));
16
+
17
+    uint16_t k;
18
+    for (k=N; k<NTRU_INT_POLY_SIZE; k++) {
19
+        a->coeffs[k] = 0;
20
+        b->coeffs[k] = 0;
21
+    }
22
+    for (k=0; k<N; k+=16) {
23
+        uint8_t j;
24
+
25
+        __m256i b256[8];
26
+        for (j=0; j<8; j++) {
27
+
28
+            b256[j] = _mm256_inserti128_si256(_mm256_castsi128_si256(
29
+                    _mm_set1_epi16(b->coeffs[k+j])),
30
+                    _mm_set1_epi16(b->coeffs[k+8+j]),1);
31
+        }
32
+
33
+        /* indices 0..7 */
34
+        __m128i tmp_a = _mm_lddqu_si128((__m128i*)&a->coeffs[0]);
35
+        __m256i a256 = _mm256_broadcastsi128_si256(tmp_a);
36
+
37
+        __m256i c256 = _mm256_lddqu_si256((__m256i*)&c_coeffs[k]);
38
+        for (j=0; j<8; j++) {
39
+            __m256i product = _mm256_mullo_epi16(a256, b256[j]);
40
+            c256 = _mm256_add_epi16(c256, product);
41
+            a256 = _mm256_bslli_epi128(a256, 2);
42
+        }
43
+        _mm256_storeu_si256((__m256i*)&c_coeffs[k], c256);
44
+
45
+        /* indices 8... */