-
Notifications
You must be signed in to change notification settings - Fork 856
/
aes.S
1698 lines (1601 loc) · 42.4 KB
/
aes.S
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@ Assembler configuration: unified syntax, Thumb code for the Cortex-M33
.syntax unified
.cpu cortex-m33
.thumb
@ Headers supplying the SHA256_* register constants and rcp_* macros used below, plus config.h
@ (presumably the source of the option symbols tested with .if throughout this file)
#include "hardware/platform_defs.h"
#include "hardware/regs/addressmap.h"
#include "hardware/regs/sha256.h"
#include "hardware/rcp.h"
#include "config.h"
@ Symbols exported from this file; systick_data is imported
.global delay
.global aes_start
.global aes_end
.global flush_reg
.global isr_systick
.extern systick_data
.global gen_lut_inverse
.global gen_lut_sbox
.if NEED_INV_ROUNDS
.global gen_lut_inv_sbox
.endif
@ The block-mode entry points are exported only when the matching option is non-zero
.if INCLUDE_ENCRYPT_CBC
.global cbc_encrypt_s
.endif
.if INCLUDE_DECRYPT_CBC
.global cbc_decrypt_s
.endif
.if INCLUDE_CRYPT_CTR
.global ctr_crypt_s
.endif
.global remap
.global gen_rand
.global init_key
.global rkey_s
.global lut_a,lut_a_map
.global lut_b,lut_b_map
.global rstate
@ RCP macros
@ Canary tag values handed to the GET_CANARY/CHK_CANARY macros below. In the code visible in this
@ file every protected routine uses a tag of its own for its get/check pair.
@ Note that the numbering skips 0x2f between the fifth and sixth definitions.
@ (Comments are kept off the definition lines so that they cannot become part of the macro bodies.)
#define CTAG0 0x2a
#define CTAG1 0x2b
#define CTAG2 0x2c
#define CTAG3 0x2d
#define CTAG4 0x2e
#define CTAG5 0x30
#define CTAG6 0x31
#define CTAG7 0x32
#define CTAG8 0x33
#define CTAG9 0x34
#define CTAG10 0x35
#define CTAG11 0x36
#define CTAG12 0x37
#define CTAG13 0x38
#define CTAG14 0x39
#define CTAG15 0x3a
#define CTAG16 0x3b
#define CTAG17 0x3c
@ SET_COUNT n: emit the RCP count-set operation for value n.
@ Assembles to nothing unless RC_COUNT is enabled; the _nodelay form is used when RC_JITTER is disabled.
.macro SET_COUNT n
  .if RC_COUNT
    .if !(RC_JITTER)
      rcp_count_set_nodelay \n
    .else
      rcp_count_set \n
    .endif
  .endif
.endm
@ CHK_COUNT n: emit the RCP count-check operation for value n.
@ Assembles to nothing unless RC_COUNT is enabled; the _nodelay form is used when RC_JITTER is disabled.
.macro CHK_COUNT n
  .if RC_COUNT
    .if !(RC_JITTER)
      rcp_count_check_nodelay \n
    .else
      rcp_count_check \n
    .endif
  .endif
.endm
@ GET_CANARY rx,tag: fetch the RCP canary for the given tag into register rx.
@ Assembles to nothing unless RC_CANARY is enabled; the _nodelay form is used when RC_JITTER is disabled.
.macro GET_CANARY rx,tag
  .if RC_CANARY
    .if !(RC_JITTER)
      rcp_canary_get_nodelay \rx,\tag
    .else
      rcp_canary_get \rx,\tag
    .endif
  .endif
.endm
@ CHK_CANARY rx,tag: check the canary value held in register rx against the given tag.
@ Assembles to nothing unless RC_CANARY is enabled; the _nodelay form is used when RC_JITTER is disabled.
.macro CHK_CANARY rx,tag
  .if RC_CANARY
    .if !(RC_JITTER)
      rcp_canary_check_nodelay \rx,\tag
    .else
      rcp_canary_check \rx,\tag
    .endif
  .endif
.endm
@ GET_CANARY_NJ rx,tag: as GET_CANARY but always uses the _nodelay RCP macro, whatever RC_JITTER is set to.
@ Assembles to nothing unless RC_CANARY is enabled.
.macro GET_CANARY_NJ rx,tag @ with no jitter even if you ask for it (otherwise slows down gen_rand a lot)
.if RC_CANARY
rcp_canary_get_nodelay \rx,\tag
.endif
.endm
@ CHK_CANARY_NJ rx,tag: as CHK_CANARY but always uses the _nodelay RCP macro, whatever RC_JITTER is set to.
@ Assembles to nothing unless RC_CANARY is enabled.
.macro CHK_CANARY_NJ rx,tag @ with no jitter even if you ask for it
.if RC_CANARY
rcp_canary_check_nodelay \rx,\tag
.endif
.endm
.section .stack.aes
@ Working storage for the AES code: the two LUT shares with their maps, the round key shares,
@ an optional CTR scratch block and the random number generator state.
@
@ Regardless of configuration the code uses a single 256-entry LUT. If both
@ encryption and decryption are enabled then this is a table of inverses
@ of GF(2⁸) field elements, from which both the S-box and inverse S-box
@ functions can be derived; otherwise it can be a plain S-box or inverse
@ S-box table.
@ In either case the LUT is represented as two shares, lut_a and lut_b,
@ whose values must be EORed. Furthermore, the contents of each share are
@ scrambled according to a 4-byte "map". The map comprises two bytes that
@ are EORed into the addressing of the share, and two bytes that are
@ EORed into the data read back from the share. Performing a lookup
@ of a value x involves computing
@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ c₁ ^ lut_b[x ^ b₀ ^ b₁] ^ d₀ ^ d₁
@ where a₀, a₁, c₀ and c₁ are the "map" of the lut_a share and
@ b₀, b₁, d₀ and d₁ are the "map" of the lut_b share.
@ In practice the result of a lookup is itself represented in two
@ shares, namely
@ lut_a[x ^ a₀ ^ a₁] ^ c₀ ^ d₀ and
@ lut_b[x ^ b₀ ^ b₁] ^ c₁ ^ d₁
@ The code reads each map as a word at offset 0x100 from the start of its share, so the map must
@ stay immediately after the 256 table bytes.
lut_a: @ LUT share A
.space 256
lut_a_map: @ the current scrambling of lut_a; not particularly secret since it can be deduced from the contents of lut_a and lut_b
.space 4
.space 4 @ align to multiple of 8
lut_b: @ LUT share B
.space 256
lut_b_map: @ the current scrambling of lut_b, in the same format as lut_a_map
.space 4
.space 4 @ align to multiple of 8
rkey_s: @ round key shares
.if RK_ROR
.space 600 @ 15 round keys of 40 bytes: two 4-word shares, each followed by a word of rotate amounts
.else
.space 480 @ 15 round keys of 32 bytes: two 4-word shares
.endif
.if CT_BPERM
ctr_scratch: @ scratch area for CTR code to use when "decrypting" out-of-range blocks
.space 16
.endif
rstate: @ SHA random state, to be initialised to TRNG bytes; zeroth byte must be initialised to zero
.space 16
@ Start of the AES code section. aes_start is an exported label whose only instruction is a nop;
@ presumably it pairs with the exported aes_end label to delimit the code - confirm against users.
.section .text.aes,"ax",%progbits
.thumb_func
aes_start:
nop
.if GEN_RAND_SHA
.balign 4
.thumb_func
@ random numbers using SHA256 hardware
@ preserves r1-r3
@ returns one 32-bit word of SHA256 output in r0
@ The bottom byte of rstate[] holds the offset (from the SHA256 block base) of the next SUM
@ register to hand out. Each call returns that SUM word and steps the offset down by 4. Once the
@ stepped offset would be zero or negative, the 128-bit value in rstate[] is advanced, hashed
@ together with 12 constant words, and the SUM7 word of the new digest is returned instead.
gen_rand:
GET_CANARY_NJ r0,CTAG1
push {r0-r3,r14} @ r0 carries the canary value when RC_CANARY is enabled
ldr r0,=#SHA256_BASE
4:
ldr r2,=#rstate
ldrb r1,[r2] @ get word counter from bottom byte of rstate[] (offset into SUM registers)
subs r3,r1,#4 @ decrement it to previous SUM register
ble 1f @ if the offset was 4 or less we have run out of SUM register values
.if SHA256_SUM0_OFFSET!=8
.err
.endif
2:
ldr r0,[r0,r1] @ read value from SUM register: note that this relies on SHA256_SUM0_OFFSET==8
strb r3,[r2] @ save updated SUM register offset in bottom byte of rstate[]
pop {r1} @ retrieve the value pushed from r0 on entry...
CHK_CANARY_NJ r1,CTAG1 @ ... and check it as the canary
pop {r1-r3,r15}
@ refill path: start a new hash over the updated rstate[]
1:
movs r3,#SHA256_SUM6_OFFSET+1
strb r3,[r2] @ reset word counter: the +1 is compensated for later
movw r1,#(1<<SHA256_CSR_BSWAP_LSB)+(1<<SHA256_CSR_START_LSB)
str r1,[r0,#SHA256_CSR_OFFSET] @ start SHA256 hardware
movs r3,#3 @ take four words from rstate, incrementing as we go
ldr r1,[r2]
adds r1,r1,#255 @ overall this adds 256 to the value in rstate and resets the bottom byte to SHA256_SUM6_OFFSET
1:
str r1,[r2],#4 @ write the updated word back to rstate[]...
str r1,[r0,#SHA256_WDATA_OFFSET] @ ... and feed it to the hash
cbz r3,3f @ all four words done (cbz leaves the carry untouched)
ldr r1,[r2]
adcs r1,r1,#0 @ propagate the carry into the next word
sub r3,r3,#1 @ preserve the carry
b 1b
3:
ldr r1,=#1223352428 @ 12 more words with a fixed value
movs r3,#12
1:
str r1,[r0,#SHA256_WDATA_OFFSET]
subs r3,r3,#1
bne 1b
1:
ldr r3,[r0,#SHA256_CSR_OFFSET]
lsrs r3,r3,#SHA256_CSR_SUM_VLD_LSB+1 @ shift the SUM_VLD bit out into the carry
bcc 1b @ wait for hardware to finish
ldr r0,[r0,#SHA256_SUM7_OFFSET] @ return SUM7; later calls work down from SUM6
pop {r1}
CHK_CANARY_NJ r1,CTAG1
pop {r1-r3,r15}
.else
@ software alternative used when GEN_RAND_SHA is disabled
@ preserves r1-r3
@ returns r0 = the new value of the first word of rstate
@ Each call shifts that word left by one bit and, if the bit shifted out was set, EORs in a
@ feedback constant, i.e. performs one step of a 32-bit linear feedback shift register.
.balign 4
.thumb_func
gen_rand:
GET_CANARY_NJ r0,CTAG1
push {r0,r1,r14} @ r0 carries the canary value when RC_CANARY is enabled
ldr r14,=rstate @ r14 is free as scratch now that the return address is on the stack
ldr r0,[r14]
ldr r1,=0x1d872b41 @ constant for a maximum-length sequence
and r1,r1,r0,asr#31 @ will we be shifting out a 1? keep the constant, otherwise 0
eor r0,r1,r0,lsl#1
str r0,[r14]
pop {r1} @ retrieve the value pushed from r0 on entry...
CHK_CANARY_NJ r1,CTAG1 @ ... and check it as the canary
pop {r1,r15}
.endif
.ltorg
.balign 4
.thumb_func
gen_lut_inverse:
@ Build the table of GF(2⁸) multiplicative inverses in lut_a, with lut_b as working storage.
@ On return r0=lut_a and r1=lut_b. r2, r3 and r12 are used as scratch.
@ Pass 1 walks the powers of the primitive element 3, recording logarithms in lut_a
@ (indexed by field element) and antilogarithms in lut_b (indexed by exponent).
        ldr     r0, =lut_a
        ldr     r1, =lut_b
        mov     r2, #0
        strb    r2, [r0]                @ (*) element 0 is not a power of 3: preset its entry to 0
        mov     r3, #1                  @ loop invariant: r3 = 3^r2, i.e. r2 = log(r3)
.Lgen_lut_inverse_powers:
        strb    r3, [r1, r2]            @ antilog table: lut_b[exponent] = element
        strb    r2, [r0, r3]            @ log table:     lut_a[element] = exponent
        lsls    r12, r3, #25            @ 2*r3 in the top byte of r12, bit 7 of r3 in the carry
        it      cs
        eorcs   r12, r12, #0x1b000000   @ reduce: top byte of r12 is now r3 multiplied by x
        eor     r3, r3, r12, lsr #24    @ multiply by x+1 ("3"), which is a primitive element
        add     r2, r2, #1
        cmp     r2, #256                @ exponents 0..255; the last pass sets log(1)=255, antilog(255)=1
        bne     .Lgen_lut_inverse_powers
@ Pass 2 replaces each log in lut_a by the inverse: inverse(i) = antilog(255 - log(i)).
        movs    r2, #255
.Lgen_lut_inverse_invert:
        ldrb    r3, [r0, r2]            @ for each i≠0, find log,...
        rsb     r3, r3, #255            @ ... negate modulo 255 (same as EOR with 255 for a byte)...
        ldrb    r3, [r1, r3]            @ ... and antilog to get inverse
        strb    r3, [r0, r2]
        subs    r2, r2, #1
        bne     .Lgen_lut_inverse_invert @ index 0 is skipped: inverse(0)=0 by (*) above
        bx      r14
.balign 4
.thumb_func
remap:
@ do a random remap of the LUTs
@ preserves r0-r11
@ Each share is remapped with its own random word from gen_rand; remap_1 applies that word to the
@ table contents and folds it into the map stored after the table.
@ r12 is not preserved (remap_1 uses it as a working register).
push {r14}
GET_CANARY r14,CTAG2
push {r0-r11,r14}
bl gen_rand @ r0 = random remap word for share A
ldr r1,=lut_a
bl remap_1
bl gen_rand @ r0 = independent random remap word for share B
ldr r1,=lut_b
bl remap_1
pop {r0-r11,r14}
CHK_CANARY r14,CTAG2
pop {r15}
remap_1:
@ r0: B0:xa B1:xb B2:ya B3:yb
@ r1: array of 256 bytes, followed by a 4-byte map
@ shuffle LUT share array such that new[i]=old[i^xa^xb]^ya^yb, update map according to r0
@ The table is handled one word (four entries) at a time. Bits 2..7 of xa^xb decide which word is
@ exchanged with which; bits 1 and 0 of xa and of xb are applied within each word by ROR#16 and
@ REV16 respectively, selected in constant time with the GE flags.
@ r2-r5 and r7-r12 are overwritten; r14 is restored before returning.
GET_CANARY r6,CTAG3
push {r6,r14}
mov r14,0x01010101 @ multiplier used to replicate a byte into all four byte lanes
ubfx r6,r0,#16,#8
ubfx r7,r0,#24,#8
mul r6,r6,r14 @ data remaps ya and yb, byte replicated
mul r7,r7,r14
movw r10,#0x1010
and r10,r10,r0,lsl#3 @ 0/16 in each byte of r10 from b1 and b9 of r0, ready for rotates by 0 or 16
mov r3,#0x7f7f7f7f
ubfx r2,r0,#0,#1
lsl r11,r3,r2 @ 0x7f or 0xfe in each byte of r11, ready for sel of rev16
ubfx r2,r0,#8,#1
lsl r12,r3,r2 @ likewise in r12, controlled by b8 of r0
ldr r2,[r1,#0x100] @ old map
eors r2,r2,r0
str r2,[r1,#0x100] @ updated map
mov r2,#252 @ loop over entries
1:
ldr r4,[r1,r2] @ r4=word currently at offset r2
eor r3,r2,r0
eor r3,r3,r0,ror#8
and r3,r3,#0xfc @ r3=remapped address r2
ldr r5,[r1,r3] @ r5=word at the partner offset, to be transformed
eors r5,r5,r6 @ remap data; ensure case x==0 works by doing both remaps on same side
eors r5,r5,r7
lsr r8,r10,#8
ror r5,r5,r8 @ ROR#16 is the same as eor of address with 2
ror r5,r5,r10
rev16 r8,r5 @ REV16 is the same as eor of address with 1
uadd8 r9,r11,r11 @ GE flags set iff the bytes of r11 are 0xfe, i.e. b0 of r0 is set (r9 is a dummy)
sel r5,r8,r5
rev16 r8,r5
uadd8 r9,r12,r12 @ GE flags set iff b8 of r0 is set
sel r5,r8,r5
mul r8,r14,r2
mul r9,r14,r3
usub8 r8,r8,r9 @ bytewise comparison of original address and remapped address, both byte replicated
@ GE flags are now set iff r2>=r3. In that case offset r2 keeps its word and offset r3 receives its
@ own transformed word; otherwise the transformed word goes to offset r2 and the word from r2 to
@ offset r3. Every pair of offsets is visited twice (once from each end), and the descending loop
@ order makes the two visits together complete the exchange with each word transformed once.
sel r8,r4,r5 @ swap r4 and r5 as necessary in constant time
str r8,[r1,r2] @ write possibly swapped values back
sel r8,r5,r4
str r8,[r1,r3]
subs r2,r2,#4
bpl 1b
pop {r6,r14}
CHK_CANARY r6,CTAG3
bx r14
.if NEED_HPERM
.balign 4
.thumb_func
hperm:
@ rotate state within registers
@ r0: B0: rotate amount for r4,r8; B1: rotate amount for r5,r9; B2: rotate amount for r6,r10; B3: rotate amount for r7,r11
@ return r0 value required to undo
@ Only the bottom two bits of each byte of r0 are used, giving a rotation of 0, 8, 16 or 24 bits;
@ the same rotation is applied to the matching words of both shares.
@ Overwrites r1, r2 and the flags.
movs r1,#0x18 @ constant for subsequent ANDs
and r2,r1,r0,lsl#3 @ extract amount
rors r4,r4,r2 @ rotate share A
rors r8,r8,r2 @ rotate share B
and r2,r1,r0,lsr#5 @ etc.
rors r5,r5,r2
rors r9,r9,r2
and r2,r1,r0,lsr#13
rors r6,r6,r2
rors r10,r10,r2
and r2,r1,r0,lsr#21
rors r7,r7,r2
rors r11,r11,r2
@ movs r1,#0 @ not needed as 0x18 has zeros in all the required places to do a two-bit-wise negate
usub8 r0,r1,r0 @ bytewise negate: only the bottom two bits of each byte matter
bx r14
.endif
.if NEED_VPERM
.balign 4
.thumb_func
vperm:
@ rotate state registers r4->r5->r6->r7->r4 etc. in constant time
@ r0: b0..1: rotate amount
@ returns r0 value required to undo
@ preserves r2
@ The rotation is done in two conditional stages (by two places, then by one place), each
@ selected with the GE flags so that the same instructions execute whatever the amount is.
@ Share A (r4-r7) and share B (r8-r11) are rotated identically. Overwrites r1 and the flags.
and r1,r0,#2
rsbs r1,r1,#0 @ 0 or fffffffe depending on b1 of r0
uadd8 r1,r1,r1 @ set/clear all GE flags according to b1 of r0: set if rotate of two places is required
mov r1,r4 @ stage 1: exchange r4<->r6 and r5<->r7 (and the same in share B) if GE is set
sel r4,r6,r4
sel r6,r1,r6
mov r1,r5
sel r5,r7,r5
sel r7,r1,r7
mov r1,r8
sel r8,r10,r8
sel r10,r1,r10
mov r1,r9
sel r9,r11,r9
sel r11,r1,r11
and r1,r0,#1
rsbs r1,r1,#0 @ 0 or ffffffff depending on b0 of r0
uadd8 r1,r1,r1 @ set/clear all GE flags according to b0 of r0: set if rotate of one place is required
mov r1,r4 @ stage 2: move each register down one place, with the old r4 going to r7, if GE is set
sel r4,r5,r4
sel r5,r6,r5
sel r6,r7,r6
sel r7,r1,r7
mov r1,r8
sel r8, r9 ,r8
sel r9, r10 ,r9
sel r10,r11,r10
sel r11,r1 ,r11
rsbs r0,r0,#0 @ generate control value for inverse operation
bx r14
.endif
.if IK_SHUFREAD
@ randomly shuffle an array n bytes long, n≤65536 a power of 2, by performing k random exchanges, k>0
@ r0: array pointer p
@ r1: n
@ r2: k
@ does not need to be a subroutine!!!
@ r0-r3 are overwritten; r4-r6 are saved and restored.
@ k must be non-zero: the count is decremented before it is tested, so k=0 would wrap around.
array_shuf:
push {r4-r6,r14}
mov r4,r0
subs r5,r1,#1 @ mask for random number generation
mov r6,r2
1:
bl gen_rand
and r1,r5,r0,lsr#16 @ second index from the top half of the random word
and r0,r5,r0 @ r0,r1 are two random numbers 0..n-1
ldrb r2,[r4,r0] @ exchange the two bytes (a no-op when the indices are equal)
ldrb r3,[r4,r1]
strb r3,[r4,r0]
strb r2,[r4,r1]
subs r6,r6,#1
bne 1b
pop {r4-r6,r15}
.endif
@ "refresh" shares of rkeys by random eor into both shares of each word
.if RK_ROR
@ and randomly change rotate amount on each word of each share
.endif
@ preserves r0-r11
@ The value represented by each round key is unchanged; only its representation in rkey_s is
@ re-randomised. This relies on gen_rand preserving r1-r3 while key words are held in registers.
.balign 4
ref_round_keys_s:
push {r14}
GET_CANARY r14,CTAG4
push {r0-r11,r14}
ldr r0,=rkey_s
mov r1,#15 @ there are 15 expanded keys
1:
.if RK_ROR
@ r2-r5: share A words, r6: share A rotate bytes ra, r7-r10: share B words, r11: share B rotate bytes rb
ldmia r0,{r2-r11}
push {r0-r1}
bl gen_rand @ xra=random extra rotates for share A
usub8 r6,r6,r0 @ ra-=xra bytewise
rors r2,r2,r0 @ a=ror(a,xra)
rev16 r0,r0 @ byte order 2301, i.e. B1 at the bottom
rors r3,r3,r0 @ a=ror(a,xra)
rev r0,r0 @ byte order 1032, i.e. B2 at the bottom
rors r4,r4,r0 @ a=ror(a,xra)
rev16 r0,r0 @ byte order 0123, i.e. B3 at the bottom
rors r5,r5,r0 @ a=ror(a,xra)
bl gen_rand @ xrb=random extra rotates for share B
usub8 r11,r11,r0 @ rb-=xrb bytewise
rors r7,r7,r0 @ b=ror(b,xrb)
rev16 r0,r0
rors r8,r8,r0 @ b=ror(b,xrb)
rev r0,r0
rors r9,r9,r0 @ b=ror(b,xrb)
rev16 r0,r0
rors r10,r10,r0 @ b=ror(b,xrb)
usub8 r1,r6,r11 @ ra-rb bytewise
@ a random word EORed into a share A word must be EORed into the matching share B word rotated
@ by ra-rb, so that both contributions line up (and cancel) once the stored rotations are undone
bl gen_rand @ xab=extra exclusive OR into shares
eors r2,r2,r0 @ a^=xab
rors r0,r0,r1 @ ror(xab,ra-rb)
eors r7,r7,r0 @ b^=ror(xab,ra-rb)
rev16 r1,r1 @ bring the rotate difference for the next word to the bottom byte
bl gen_rand @ xab
eors r3,r3,r0 @ a^=xab
rors r0,r0,r1 @ ror(xab,ra-rb)
eors r8,r8,r0 @ b^=ror(xab,ra-rb)
rev r1,r1
bl gen_rand @ xab
eors r4,r4,r0 @ a^=xab
rors r0,r0,r1 @ ror(xab,ra-rb)
eors r9,r9,r0 @ b^=ror(xab,ra-rb)
rev16 r1,r1
bl gen_rand @ xab
eors r5,r5,r0 @ a^=xab
rors r0,r0,r1 @ ror(xab,ra-rb)
eors r10,r10,r0 @ b^=ror(xab,ra-rb)
pop {r0-r1}
stmia r0!,{r2-r11} @ store the refreshed key and advance to the next one
.else
@ r4-r7: share A words, r8-r11: share B words; the same random word goes into both shares
ldmia r0,{r4-r11} @ EOR random data into the shares
push {r0-r1}
bl gen_rand
eor r4,r4,r0
eor r8,r8,r0
bl gen_rand
eor r5,r5,r0
eor r9,r9,r0
bl gen_rand
eor r6,r6,r0
eor r10,r10,r0
bl gen_rand
eor r7,r7,r0
eor r11,r11,r0
pop {r0-r1}
stmia r0!,{r4-r11} @ store the refreshed key and advance to the next one
.endif
subs r1,r1,#1
bne 1b
pop {r0-r11,r14}
CHK_CANARY r14,CTAG4
pop {r15}
@ switch from non-shared to shared state
@ in:  r4-r7 = state
@ out: r8-r11 = four fresh random words (share B), r4-r7 = state EOR those words (share A)
@ preserves r0-r3
.balign 4
ns_to_s:
push {r14}
GET_CANARY r14,CTAG5
push {r0-r3,r14}
bl gen_rand
mov r8,r0
bl gen_rand
mov r9,r0
bl gen_rand
mov r10,r0
bl gen_rand
mov r11,r0
eors r4,r4,r8 @ mask the state so that share A EOR share B equals the original state
eors r5,r5,r9
eors r6,r6,r10
eors r7,r7,r11
pop {r0-r3,r14}
CHK_CANARY r14,CTAG5
pop {r15}
.if NEED_ROUNDS
.balign 4
.thumb_func
shift_rows_s:
@ Move bytes between the four state words of each share: r4-r7 are processed first, then r8-r11
@ in exactly the same way. In the comments below state[0..3] means r4..r7 (or r8..r11 for the
@ second half). Byte 0 of each word stays put; the other bytes are moved between words using
@ masked EOR exchanges. Overwrites r0, r1 and the flags.
@ first "rotate" the two most-significant bytes of the state by two registers
@ slightly faster (but not shorter?) with ubfx/bfi
eors r0,r4,r6 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r4,r4,r0
eors r6,r6,r0
eors r0,r5,r7 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r5,r5,r0
eors r7,r7,r0
@ next "rotate" the two odd-significance bytes of the state by one register
eors r1,r7,r4 @ tb=state[3]^state[0]; tb&=0xff00ff00;
ands r1,r1,#0xff00ff00
eors r0,r4,r5 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta;
ands r0,r0,#0xff00ff00
eors r4,r4,r0
eors r0,r5,r6 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta;
ands r0,r0,#0xff00ff00
eors r5,r5,r0
eors r0,r6,r7 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta;
ands r0,r0,#0xff00ff00
eors r6,r6,r0
eors r7,r7,r1 @ state[3]^=tb;
@ repeat for other share
eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r8,r8,r0
eors r10,r10,r0
eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r9,r9,r0
eors r11,r11,r0
eors r1,r11,r8 @ tb=state[3]^state[0]; tb&=0xff00ff00;
ands r1,r1,#0xff00ff00
eors r0,r8,r9 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[0]^=ta;
ands r0,r0,#0xff00ff00
eors r8,r8,r0
eors r0,r9,r10 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[1]^=ta;
ands r0,r0,#0xff00ff00
eors r9,r9,r0
eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[2]^=ta;
ands r0,r0,#0xff00ff00
eors r10,r10,r0
eors r11,r11,r1 @ state[3]^=tb;
bx r14
.endif
.if NEED_INV_ROUNDS
.balign 4
.thumb_func
inv_shift_rows_s:
@ Inverse of shift_rows_s, applied first to r4-r7 and then identically to r8-r11. In the comments
@ below state[0..3] means r4..r7 (or r8..r11 for the second half). The exchange of the top two
@ bytes is its own inverse; the odd-significance bytes are moved one register in the opposite
@ direction to shift_rows_s. Overwrites r0, r1 and the flags.
@ first half is the same as shift_rows; halves could be done in opposite order for tail chain
eors r0,r4,r6 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r4,r4,r0
eors r6,r6,r0
eors r0,r5,r7 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r5,r5,r0
eors r7,r7,r0
eors r1,r7,r4 @ tb=state[3]^state[0]; tb&=0xff00ff00;
ands r1,r1,#0xff00ff00
eors r0,r6,r7 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[3]^=ta;
ands r0,r0,#0xff00ff00
eors r7,r7,r0
eors r0,r5,r6 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[2]^=ta;
ands r0,r0,#0xff00ff00
eors r6,r6,r0
eors r0,r4,r5 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[1]^=ta;
ands r0,r0,#0xff00ff00
eors r5,r5,r0
eors r4,r4,r1 @ state[0]^=tb;
@ repeat for other share
eors r0,r8,r10 @ ta=state[0]^state[2]; ta&=0xffff0000; state[0]^=ta; state[2]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r8,r8,r0
eors r10,r10,r0
eors r0,r9,r11 @ ta=state[1]^state[3]; ta&=0xffff0000; state[1]^=ta; state[3]^=ta;
lsrs r0,r0,#16
lsls r0,r0,#16
eors r9,r9,r0
eors r11,r11,r0
eors r1,r11,r8 @ tb=state[3]^state[0]; tb&=0xff00ff00;
ands r1,r1,#0xff00ff00
eors r0,r10,r11 @ ta=state[2]^state[3]; ta&=0xff00ff00; state[3]^=ta;
ands r0,r0,#0xff00ff00
eors r11,r11,r0
eors r0,r9,r10 @ ta=state[1]^state[2]; ta&=0xff00ff00; state[2]^=ta;
ands r0,r0,#0xff00ff00
eors r10,r10,r0
eors r0,r8,r9 @ ta=state[0]^state[1]; ta&=0xff00ff00; state[1]^=ta;
ands r0,r0,#0xff00ff00
eors r9,r9,r0
eors r8,r8,r1 @ state[0]^=tb;
bx r14
.endif
@ multiply polynomial over GF(2⁸) by c(x) = 0x03x³ + 0x01x² + 0x01x + 0x02 modulo x⁴+1
@ r0x00 is a register holding 0x00000000; r0x1b is a register holding 0x1b1b1b1b
@ rx: the word to transform, updated in place; rt, ru: scratch registers, overwritten
@ The flags (including GE) are overwritten; r0x00 and r0x1b are only read.
.macro mixcol rx,rt,ru,r0x00,r0x1b
@ let rx=(a,b,c,d)
uadd8 \rt,\rx,\rx @ MSB of each byte into the GE flags
sel \ru,\r0x1b,\r0x00 @ get bytewise correction for bytewise field multiplication by 2
eors \rt,\rt,\ru @ (2a,2b,2c,2d)
eors \ru,\rt,\rx @ (3a,3b,3c,3d)
eors \rt,\rt,\rx,ror#24 @ (2a+b,2b+c,2c+d,2d+a)
eors \rt,\rt,\rx,ror#16 @ (2a+b+c,2b+c+d,2c+d+a,2d+a+b)
eors \rx,\rt,\ru,ror#8 @ (2a+b+c+3d,2b+c+d+3a,2c+d+a+3b,2d+a+b+3c)
.endm
@ multiply polynomial over GF(2⁸) by d(x) = 0x0Bx³ + 0x0Dx² + 0x09x + 0x0E modulo x⁴+1; c(x)d(x)=1 modulo x⁴+1
@ rx: the word to transform, updated in place; rt, ru, rv, rw: scratch registers, overwritten
@ r0x00 and r0x1b must hold 0x00000000 and 0x1b1b1b1b as for mixcol; they are only read.
@ The flags (including GE) are overwritten.
.macro invmixcol rx,rt,ru,rv,rw,r0x00,r0x1b
@ !!! can probably save some registers, e.g. allow trashing of r0x00, r0x1b
@ can possibly also simplify slightly with refactorisation
uadd8 \rt,\rx,\rx @ field multiplication by 2 as above
sel \rw,\r0x1b,\r0x00
eors \rt,\rt,\rw @ 2x
uadd8 \ru,\rt,\rt
sel \rw,\r0x1b,\r0x00
eors \ru,\ru,\rw @ 4x
uadd8 \rv,\ru,\ru
sel \rw,\r0x1b,\r0x00
eors \rv,\rv,\rw @ 8x
eors \rx,\rx,\rv @ 9x
eors \rw,\rx,\rt @ 11x
eors \rw,\rw,\rx,ror#16 @ 11x ^ 9x ROL #16
eors \rx,\rx,\ru @ 13x
eors \rw,\rw,\rx,ror#8 @ 11x ^ 9x ROL #16 ^ 13x ROL #24
eors \rt,\rt,\ru @ 6x
eors \rt,\rt,\rv @ 14x
eors \rx,\rt,\rw,ror#8 @ 14x ^ 9x ROL #8 ^ 13x ROL #16 ^ 11x ROL #24
.endm
.if NEED_ROUNDS
.balign 4
.thumb_func
mix_cols_s:
@ Apply the mixcol transformation to all eight state words, r4-r7 and r8-r11. The transformation
@ is built only from EORs, rotates and field doubling, so it is applied to each share separately.
@ Overwrites r0-r3 and the flags.
mov r2,#0x00000000 @ constants required by the mixcol macro
mov r3,#0x1b1b1b1b
mixcol r4 ,r0,r1,r2,r3 @ apply mixcol to each state word
mixcol r5 ,r0,r1,r2,r3
mixcol r6 ,r0,r1,r2,r3
mixcol r7 ,r0,r1,r2,r3
mixcol r8 ,r0,r1,r2,r3
mixcol r9 ,r0,r1,r2,r3
mixcol r10,r0,r1,r2,r3
mixcol r11,r0,r1,r2,r3
bx r14
.endif
.if NEED_INV_ROUNDS
.balign 4
.thumb_func
inv_mix_cols_s:
@ Apply the invmixcol transformation to all eight state words, r4-r7 and r8-r11.
@ invmixcol needs four scratch registers (r0-r3) plus two constant registers, so r12 and r14 hold
@ the constants and the return address is kept on the stack.
@ Overwrites r0-r3, r12 and the flags.
push {r14}
GET_CANARY r14,CTAG6
push {r14}
mov r12,#0x00000000 @ constants required by the invmixcol macro
mov r14,#0x1b1b1b1b
invmixcol r4 ,r0,r1,r2,r3,r12,r14 @ apply invmixcol to each state word
invmixcol r5 ,r0,r1,r2,r3,r12,r14
invmixcol r6 ,r0,r1,r2,r3,r12,r14
invmixcol r7 ,r0,r1,r2,r3,r12,r14
invmixcol r8 ,r0,r1,r2,r3,r12,r14
invmixcol r9 ,r0,r1,r2,r3,r12,r14
invmixcol r10,r0,r1,r2,r3,r12,r14
invmixcol r11,r0,r1,r2,r3,r12,r14
pop {r14}
CHK_CANARY r14,CTAG6
pop {r15}
.endif
.if SBOX_VIA_INV
@ bytewise EOR-convolution with constant 0x1f
@ rx: word to transform, updated in place; rt, ru: scratch registers, overwritten; flags overwritten
@ The whole-word rotates let bits cross byte boundaries; the second half of the macro computes
@ the correction that puts those bits back into the right byte.
.macro conv_0x1f rx,rt,ru
eors \rt,\rx,\rx,ror#31 @ t=x^ROL(x,1);
eors \rt,\rt,\rt,ror#30 @ t=t^ROL(t,2);
eors \rt,\rt,\rx,ror#28 @ t=t^ROL(x,4); @ convolution with byte boundaries "trashed"
ands \ru,\rx,#0xf0f0f0f0 @ u=x&0xf0f0f0f0;
eors \ru,\ru,\ru,ror#31 @ u=u^ROL(u,1);
eors \ru,\ru,\ru,ror#30 @ u=u^ROL(u,2);
ands \ru,\ru,#0x87878787 @ u=u&0x87878787; @ compensation for trashing
eors \ru,\ru,\ru,ror#24 @ u=u^ROL(u,8);
eors \rx,\rt,\ru,ror#7 @ t^=ROR(u,7); @ with trashing fixed
.endm
@ bytewise EOR-convolution with constant 0x4a
@ rx: word to transform, updated in place; rt, ru: scratch registers, overwritten; flags overwritten
@ Same approach as conv_0x1f (whole-word rotates followed by a correction term), finishing with a
@ rotate of each byte built from a bytewise doubling and the saved top bits.
.macro conv_0x4a rx,rt,ru
eors \rt,\rx,\rx,ror#30 @ t=x^ROL(x,2);
eors \rt,\rt,\rx,ror#27 @ t=t^ROL(x,5);
ands \ru,\rx,#0xf8f8f8f8 @ u=x&0xf8f8f8f8;
eors \ru,\ru,\ru,ror#29 @ u=u^ROL(u,3);
ands \ru,\ru,#0xc7c7c7c7 @ u=u&0xc7c7c7c7;
eors \ru,\ru,\ru,ror#24 @ u=u^ROL(u,8);
eors \rt,\rt,\ru,ror#6 @ t^=ROR(u,6);
ands \ru,\rt,#0x80808080 @ t=rorbytes(t,7);
uadd8 \rt,\rt,\rt
orrs \rx,\rt,\ru,lsr#7
.endm
.balign 4
.thumb_func
map_sbox_s:
@ Apply the S-box to every byte of the shared state (share A in r4-r7, share B in r8-r11) when
@ the LUT holds field inverses: look up the inverse, then apply the affine transformation.
@ The convolution is applied to each share separately; the additive constant 0x63 is split
@ between the matching words of the two shares.
@ Overwrites r0-r3, r12 and the flags.
push {r14}
GET_CANARY r14,CTAG7
push {r14}
bl lutmap_state_s @ the S-box function is an inverse followed by an affine transformation:
conv_0x1f r4 ,r0,r1 @ see https://en.wikipedia.org/wiki/Rijndael_S-box
conv_0x1f r5 ,r0,r1
conv_0x1f r6 ,r0,r1
conv_0x1f r7 ,r0,r1
conv_0x1f r8 ,r0,r1
conv_0x1f r9 ,r0,r1
conv_0x1f r10,r0,r1
conv_0x1f r11,r0,r1
eor r4 ,r4 ,#0xcacacaca @ scramble the shares slightly: 0x63=0xca^0xa9 etc.
eor r5 ,r5 ,#0xf5f5f5f5
eor r6 ,r6 ,#0x0c0c0c0c
eor r7 ,r7 ,#0xa2a2a2a2
eor r8 ,r8 ,#0xa9a9a9a9
eor r9 ,r9 ,#0x96969696
eor r10,r10,#0x6f6f6f6f
eor r11,r11,#0xc1c1c1c1
pop {r14}
CHK_CANARY r14,CTAG7
pop {r15}
.if NEED_INV_ROUNDS
.balign 4
.thumb_func
inv_map_sbox_s:
@ Apply the inverse S-box to every byte of the shared state (share A in r4-r7, share B in r8-r11)
@ when the LUT holds field inverses: apply the inverse affine transformation, then look up the
@ inverse. The additive constant 0x05 is split between the matching words of the two shares.
@ Overwrites r0-r3, r12 and the flags.
push {r14}
GET_CANARY r14,CTAG8
push {r14} @ similarly, the inverse S-box is an affine transformation followed by an inverse
conv_0x4a r4 ,r0,r1
conv_0x4a r5 ,r0,r1
conv_0x4a r6 ,r0,r1
conv_0x4a r7 ,r0,r1
conv_0x4a r8 ,r0,r1
conv_0x4a r9 ,r0,r1
conv_0x4a r10,r0,r1
conv_0x4a r11,r0,r1
eor r4 ,r4 ,#0xd1d1d1d1 @ scramble the shares slightly: 0x05=0xd1^0xd4 etc.
eor r5 ,r5 ,#0x94949494
eor r6 ,r6 ,#0xfcfcfcfc
eor r7 ,r7 ,#0x3a3a3a3a
eor r8 ,r8 ,#0xd4d4d4d4
eor r9 ,r9 ,#0x91919191
eor r10,r10,#0xf9f9f9f9
eor r11,r11,#0x3f3f3f3f
bl lutmap_state_s
pop {r14}
CHK_CANARY r14,CTAG8
pop {r15}
.endif
.else
.balign 4
.thumb_func
gen_lut_sbox:
@ Fill both lut_a and lut_b with the forward S-box table.
@ On return r0=lut_a+256 and r1=lut_b+256. r2, r3 and r12 are used as scratch.
@ The S-box of x is the field inverse of x put through the affine transformation
@ (EOR-convolution with 0x1f, then EOR with 0x63).
        push    {r14}
        GET_CANARY r14,CTAG9
        push    {r14}
        bl      gen_lut_inverse         @ lut_a := table of inverses; returns r0=lut_a, r1=lut_b
        mov     r14, #256               @ number of entries still to convert
.Lgen_lut_sbox_entry:
        ldrb    r2, [r0]                @ inverse of the current index
        eors    r3, r2, r2, lsl #1      @ convolve byte with 0x1f
        eors    r3, r3, r3, lsl #2
        eors    r3, r3, r2, lsl #4
        eors    r2, r3, r3, lsr #8      @ fold the bits above bit 7 back into the low byte
        eor     r2, r2, #0x63           @ and add 0x63
        strb    r2, [r1], #1            @ same value into both tables, advancing both pointers
        strb    r2, [r0], #1
        subs    r14, r14, #1
        bne     .Lgen_lut_sbox_entry
        pop     {r14}
        CHK_CANARY r14,CTAG9
        pop     {r15}
.if NEED_INV_ROUNDS
.balign 4
.thumb_func
gen_lut_inv_sbox:
@ Fill lut_a with the inverse S-box table; lut_b is left holding the forward S-box.
@ r0-r3 and r12 are used as scratch.
        push    {r14}
        GET_CANARY r14,CTAG10
        push    {r14}
        bl      gen_lut_sbox            @ both tables := forward S-box; r0, r1 point just past them
        sub     r1, r1, #256            @ rewind to the start of lut_b ...
        sub     r0, r0, #256            @ ... and of lut_a
        mov     r2, #0                  @ x runs over 0..255
.Lgen_lut_inv_sbox_entry:
        ldrb    r3, [r1], #1            @ get y=S-box(x) from lut_b...
        strb    r2, [r0, r3]            @ ... and store x at location y of lut_a
        adds    r2, r2, #1
        cmp     r2, #255
        bls     .Lgen_lut_inv_sbox_entry
        pop     {r14}
        CHK_CANARY r14,CTAG10
        pop     {r15}
.endif
.endif
@ if we are using direct S-box lookup then [inv_]map_sbox_s is the same as lutmap_state_s
.if !SBOX_VIA_INV
.balign 4
.thumb_func
map_sbox_s:
.if NEED_INV_ROUNDS
.thumb_func
inv_map_sbox_s:
.endif
.endif
@ map all bytes of the state through the LUT
@ in/out: share A of the state in r4-r7, share B in r8-r11
@ Each iteration maps the bottom byte of the state, shifts the whole 128-bit state of each share
@ down by one byte and inserts the mapped byte at the top, so that after 16 iterations every byte
@ is back in its original position. The looked-up value is never formed unmasked: one result
@ share comes from lut_a and the other from lut_b.
@ Overwrites r0-r3, r12 and the flags.
.balign 4
lutmap_state_s:
push {r14}
GET_CANARY r14,CTAG11
push {r14}
ldr r12,=lut_a
ldr r14,=lut_b
mov r0,#0x8000 @ "counter" for bytes of state mapped
1:
ldr r3,[r12,#0x100] @ lut_a_map
eor r1,r4,r3 @ share A of x ^ share A of lut_a address map
eor r1,r1,r8 @ ^ share B of x
eor r1,r1,r3,ror#8 @ ^ share B of lut_a address map
uxtb r1,r1 @ only the bottom byte forms the table index
ldrb r1,[r12,r1] @ look up in lut_a
eor r1,r1,r3,ror#16 @ ^ share A of lut_a data map
ldr r3,[r14,#0x100] @ lut_b_map
eor r1,r1,r3,ror#24 @ ^ share B of lut_b data map, generating share A of the result
eor r2,r4,r3 @ share A of x ^ share A of lut_b address map
eor r2,r2,r8 @ ^ share B of x
eor r2,r2,r3,ror#8 @ ^ share B of lut_b address map
uxtb r2,r2
ldrb r2,[r14,r2] @ look up in lut_b
eor r2,r2,r3,ror#16 @ ^ share A of lut_b data map
ldr r3,[r12,#0x100] @ lut_a_map
eor r2,r2,r3,ror#24 @ ^ share B of lut_a data map, generating share B of the result
@ only the bottom bytes of r1 and r2 are meaningful; the lsl#24 below discards the rest
lsrs r4,#8 @ shift share A of state down one byte...
orrs r4,r4,r5,lsl#24
lsrs r5,#8
orrs r5,r5,r6,lsl#24
lsrs r6,#8
orrs r6,r6,r7,lsl#24
lsrs r7,#8
orrs r7,r7,r1,lsl#24 @ and insert share A of mapped byte
lsrs r8,#8 @ shift share B of state down one byte...
orrs r8,r8,r9,lsl#24
lsrs r9,#8
orrs r9,r9,r10,lsl#24
lsrs r10,#8
orrs r10,r10,r11,lsl#24
lsrs r11,#8
orrs r11,r11,r2,lsl#24 @ and insert share B of mapped byte
lsrs r0,#1 @ count 16 iterations
bne 1b
pop {r14}
CHK_CANARY r14,CTAG11
pop {r15}
@ perform one EOR step in round key generation
@ !!! can we introduce some more randomness into the shares here?
@ in:  r0 = read pointer (the last round key but one, per the comments below)
@      r1 = write pointer for the round key being generated
@      r4, r8 = shares A and B of the word to be EORed into the first word of the key
@ out: r0 and r1 advanced past one complete round key (both shares)
@      r4, r8 = shares A and B of the last word of the new round key
@ Overwrites r5-r7, r9-r12 and the flags.
.balign 4
grk_s_step:
ldmia r0!,{r5-r7,r12} @ from last round key_a but one
eors r5,r5,r4
eors r6,r6,r5 @ each word is chained into the next
eors r7,r7,r6
eors r12,r12,r7
stmia r1!,{r5-r7,r12}
mov r4,r12
.if RK_ROR
movs r12,#0
str r12,[r0],#4 @ write zero to the rotate word at the read pointer, stepping past it
str r12,[r1],#4 @ the new share A is stored unrotated
.endif
ldmia r0!,{r9-r11,r12} @ from last round key_b but one
eors r9,r9,r8
eors r10,r10,r9
eors r11,r11,r10
eors r12,r12,r11
stmia r1!,{r9-r11,r12}
mov r8,r12
.if RK_ROR
movs r12,#0
str r12,[r0],#4 @ as above, for the share B rotate words
str r12,[r1],#4
.endif
bx r14
@ jitter rx: when IK_JITTER is enabled, rotate rx right by one bit and branch on the bit that was
@ rotated out. The branch target is the very next instruction, so the program flow is the same
@ either way and only the taken or not-taken path differs. rx and the flags are modified.
@ The label is numeric, built from the macro invocation counter, so every use gets its own.
@ With IK_JITTER disabled the macro emits nothing.
.macro jitter rx
.if IK_JITTER
rors \rx,\rx,#1
bcc \@f
\@:
.else
@ nothing
.endif
.endm
.balign 4
.thumb_func
init_key:
@ r0: rkeys_s
@ r1: raw key data (32 bytes)
.if RK_ROR
@ rkeys_s is a 40*15=600-byte region
@ each of the 15 round keys is represented as two 4-word regions rka[0..3] and rkb[0..3], each of which is followed by a word containing
@ four byte-wide rotate values ra[i] and rb[i]
@ such that rk[i]=(rka[i] ROR ra[i])^(rkb[i] ROR rb[i]) gives the round keys
@ rotations always operate mod 32, so we do not bother to mask the rotate amounts to 5 bits
.else
@ rkeys_s is a 32*15=480-byte region
@ each of the 15 round keys is represented as two 4-word regions rka[0..3] and rkb[0..3]
@ such that rk[i]=rka[i]^rkb[i] gives the round keys
.endif
GET_CANARY r12,CTAG12
push {r4-r12,r14}
.if IK_JITTER
push {r0,r1}
bl gen_rand
mov r12,r0
pop {r0,r1}
.endif
jitter r12
mov r4,r0
mov r5,r1
.if IK_SHUFREAD
SET_COUNT 73
add r6,r4,#128 @ use 64 bytes of temporary space at r0+128 for buf
mov r7,#0
1:
bl gen_rand