openssl/crypto/aes/asm/aes-sparcv9.pl
Matt Caswell 54b4053130 Update copyright year
Reviewed-by: Richard Levitte <levitte@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/16176)
2021-07-29 15:41:35 +01:00

1195 lines
30 KiB
Raku
Executable File

#! /usr/bin/env perl
# Copyright 2005-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the License.
# ====================================================================
#
# Version 1.1
#
# The major reason for undertaking this effort was to mitigate the
# hazard of cache-timing attacks. This is [currently and initially!]
# addressed in two ways. 1. S-boxes are compressed from 5KB to
# 2KB+256B in size each. 2. References to them are scheduled for L2
# cache latency, meaning that the tables don't have to reside in L1
# cache. Once again, this is an initial draft and one should expect
# more countermeasures to be implemented...
#
# Version 1.1 prefetches T[ed]4 in order to mitigate attacks on the
# last round.
#
# Even though performance was not the primary goal [on the contrary,
# extra shifts "induced" by the compressed S-box and the longer loop
# epilogue "induced" by scheduling for L2 have a negative effect on
# performance], the code turned out to run in ~23 cycles per processed
# byte en-/decrypted with a 128-bit key. This is a pretty good result
# for code with the mentioned qualities on an UltraSPARC core. Compared
# to Sun C generated code my encrypt procedure runs just a few percent
# faster, while the decrypt one is a whole 50% faster [yes, Sun C
# failed to generate an optimal decrypt procedure]. Compared to GNU C
# generated code both procedures are more than 60% faster:-)
# Redirect STDOUT to the file named by the last command-line argument,
# when one is given (an absent, empty or "0" argument leaves STDOUT
# untouched). The two-argument form of open is kept so that the way the
# file name is interpreted does not change. A failed open now aborts the
# script; previously the failure was ignored and the generated code went
# to whatever STDOUT was before.
$output = pop and (open(STDOUT,">$output") or die "can't open $output: $!");
# Symbols substituted verbatim into the "save" and stack-slot operands
# of the templates below. NOTE(review): STACK_FRAME and STACK_BIAS are
# presumably macros supplied by crypto/sparc_arch.h - confirm.
($frame, $bias) = ("STACK_FRAME", "STACK_BIAS");
# Bytes of local storage added to the frame of the core routines; the
# return address is parked in it.
$locals = 16;

# Sixteen accumulators that receive the table lookups, listed four per
# output word.
($acc0,  $acc1,  $acc2,  $acc3)  = qw(%l0 %o0 %o1 %o2);
($acc4,  $acc5,  $acc6,  $acc7)  = qw(%l1 %o3 %o4 %o5);
($acc8,  $acc9,  $acc10, $acc11) = qw(%l2 %o7 %g1 %g2);
($acc12, $acc13, $acc14, $acc15) = qw(%l3 %g3 %g4 %g5);

# Second set of four state words; the core loops alternate between
# $s0-$s3 and $t0-$t3.
($t0, $t1, $t2, $t3) = qw(%l4 %l5 %l6 %l7);

# Incoming state words, table base and key pointer of the core routines.
($s0, $s1, $s2, $s3) = qw(%i0 %i1 %i2 %i3);
($tbl, $key) = qw(%i4 %i5);
$rounds = "%i7";	# aliases with return address, which is off-loaded to stack
# Append one line to $code for every argument, in order:
#
#	.long	0xXXXXXXXX,0xXXXXXXXX
#
# The word is emitted twice, so every table entry is 8 bytes wide with
# the same value in both halves. The round code depends on that layout:
# it indexes with a byte value times 8, loads the entry with ldx and
# shifts it right by 8, 16 or 24 bits, which leaves the word rotated
# right by that amount in the low 32 bits.
#
# Arguments: a list of 32-bit integers; processing stops at the first
# undefined value. Nothing useful is returned.
#
# The sub used to be declared with an empty prototype, which claims it
# takes no arguments and only worked because callers use the "&" form.
# It is now declared without a prototype.
sub _data_word
{ while(defined(my $word=shift)) {
	$code.=sprintf"\t.long\t0x%08x,0x%08x\n",$word,$word;
  }
}
# Start of the generated file: make sure __ASSEMBLER__ is defined before
# pulling in crypto/sparc_arch.h, declare %g2 and %g3 as scratch
# registers when __arch64__ is defined, open the .text section and place
# the AES_Te label on a 256-byte boundary. The table contents are
# appended by the statements that follow.
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
.align 256
AES_Te:
___
# The 256 32-bit words of the AES_Te table. _data_word emits each of
# them twice, so they occupy 256*8 = 2048 bytes in the output.
&_data_word(
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
# Remainder of the encrypt side, emitted as one template.
#
# 1. A 256-byte table placed directly after the 2048 bytes of doubled
#    words under AES_Te. The round code addresses it as $tbl+2048 and
#    its prefetch comments call it te4. The .type/.size directives then
#    close the AES_Te object.
#
# 2. _sparcv9_AES_encrypt, the core routine. Its entry point is placed
#    16 bytes past a 64-byte boundary (.align 64, .skip 16).
#    In:  $s0-$s3 hold four 32-bit state words, $tbl the address of
#         AES_Te and $key the key schedule (the round count is read
#         from offset 240).
#    Out: $s0-$s3. They are %i registers, so after "restore" the caller
#         sees them as %o0-%o3.
#    - %i7 doubles as $rounds, so the return address is stored in the
#      $locals area of the new frame first and reloaded before "ret".
#    - The first four key words are xored into $s0-$s3 and the next
#      four are preloaded into $t0-$t3.
#    - Table entries are 8 bytes wide (see _data_word), so each index
#      is a state byte times 8 (shift, then mask with 2040). One ldx
#      followed by srlx by 8, 16 or 24 leaves the word rotated right by
#      that amount in the low 32 bits.
#    - The counter is the round count divided by two. A full pass of
#      .Lenc_loop applies two rounds: 16 lookups indexed by the bytes of
#      $s0-$s3 are folded into $t0-$t3, then the same pattern runs from
#      $t0-$t3 back into $s0-$s3, which were reloaded with key words.
#      $key advances by 32 bytes per pass.
#    - The second half of each pass also loads from $tbl+2048 in
#      32-byte steps into %g0; the result is discarded, the loads only
#      touch the byte table ("prefetch te4").
#    - When the counter reaches zero, "bz,a,pn" leaves the loop after
#      the first 16 lookups. Its annulled delay slot runs only when the
#      branch is taken and repoints $rounds at $tbl+2048.
#    - .Lenc_last folds the pending lookups into $t0-$t3, loads the last
#      four key words into $s0-$s3, then substitutes every byte of
#      $t0-$t3 through the byte table with ldub, shifts it back into
#      position and xors it into $s0-$s3.
#    - "fmovs %f0,%f0" lines are FP no-ops; the substitution at the end
#      of this script removes them from the output.
#    - Many instructions carry a bare trailing "!", mostly eight
#      instructions apart. NOTE(review): this looks like an alignment or
#      issue-group marker - confirm before reordering anything.
#
# 3. AES_encrypt, the exported entry point. On entry %o0 addresses the
#    16-byte input block, %o1 the output block and %o2 the key schedule
#    (visible as %i0-%i2 after "save"). NOTE(review): presumably this
#    matches AES_encrypt(in, out, key) on the C side - confirm against
#    the header.
#    - "or"/"andcc" test the two low bits of both pointers. The "save"
#      sits in the delay slot of the branch and runs on both paths.
#    - Aligned path: the block is read with four "ld" and written with
#      four "st".
#    - .Lunaligned_enc: the block is read with sixteen "ldub", each
#      group of four combined most significant byte first, and written
#      back with sixteen "stb".
#    - On both paths "call .+8" records its own address in %o7 and the
#      "add" in its delay slot turns that into the address of AES_Te in
#      %o4; the key pointer is copied to %o5 in the delay slot of the
#      call to the core routine.
$code.=<<___;
.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
.type AES_Te,#object
.size AES_Te,(.-AES_Te)
.align 64
.skip 16
_sparcv9_AES_encrypt:
save %sp,-$frame-$locals,%sp
stx %i7,[%sp+$bias+$frame+0] ! off-load return address
ld [$key+240],$rounds
ld [$key+0],$t0
ld [$key+4],$t1 !
ld [$key+8],$t2
srl $rounds,1,$rounds
xor $t0,$s0,$s0
ld [$key+12],$t3
srl $s0,21,$acc0
xor $t1,$s1,$s1
ld [$key+16],$t0
srl $s1,13,$acc1 !
xor $t2,$s2,$s2
ld [$key+20],$t1
xor $t3,$s3,$s3
ld [$key+24],$t2
and $acc0,2040,$acc0
ld [$key+28],$t3
nop
.Lenc_loop:
srl $s2,5,$acc2 !
and $acc1,2040,$acc1
ldx [$tbl+$acc0],$acc0
sll $s3,3,$acc3
and $acc2,2040,$acc2
ldx [$tbl+$acc1],$acc1
srl $s1,21,$acc4
and $acc3,2040,$acc3
ldx [$tbl+$acc2],$acc2 !
srl $s2,13,$acc5
and $acc4,2040,$acc4
ldx [$tbl+$acc3],$acc3
srl $s3,5,$acc6
and $acc5,2040,$acc5
ldx [$tbl+$acc4],$acc4
fmovs %f0,%f0
sll $s0,3,$acc7 !
and $acc6,2040,$acc6
ldx [$tbl+$acc5],$acc5
srl $s2,21,$acc8
and $acc7,2040,$acc7
ldx [$tbl+$acc6],$acc6
srl $s3,13,$acc9
and $acc8,2040,$acc8
ldx [$tbl+$acc7],$acc7 !
srl $s0,5,$acc10
and $acc9,2040,$acc9
ldx [$tbl+$acc8],$acc8
sll $s1,3,$acc11
and $acc10,2040,$acc10
ldx [$tbl+$acc9],$acc9
fmovs %f0,%f0
srl $s3,21,$acc12 !
and $acc11,2040,$acc11
ldx [$tbl+$acc10],$acc10
srl $s0,13,$acc13
and $acc12,2040,$acc12
ldx [$tbl+$acc11],$acc11
srl $s1,5,$acc14
and $acc13,2040,$acc13
ldx [$tbl+$acc12],$acc12 !
sll $s2,3,$acc15
and $acc14,2040,$acc14
ldx [$tbl+$acc13],$acc13
and $acc15,2040,$acc15
add $key,32,$key
ldx [$tbl+$acc14],$acc14
fmovs %f0,%f0
subcc $rounds,1,$rounds !
ldx [$tbl+$acc15],$acc15
bz,a,pn %icc,.Lenc_last
add $tbl,2048,$rounds
srlx $acc1,8,$acc1
xor $acc0,$t0,$t0
ld [$key+0],$s0
fmovs %f0,%f0
srlx $acc2,16,$acc2 !
xor $acc1,$t0,$t0
ld [$key+4],$s1
srlx $acc3,24,$acc3
xor $acc2,$t0,$t0
ld [$key+8],$s2
srlx $acc5,8,$acc5
xor $acc3,$t0,$t0
ld [$key+12],$s3 !
srlx $acc6,16,$acc6
xor $acc4,$t1,$t1
fmovs %f0,%f0
srlx $acc7,24,$acc7
xor $acc5,$t1,$t1
srlx $acc9,8,$acc9
xor $acc6,$t1,$t1
srlx $acc10,16,$acc10 !
xor $acc7,$t1,$t1
srlx $acc11,24,$acc11
xor $acc8,$t2,$t2
srlx $acc13,8,$acc13
xor $acc9,$t2,$t2
srlx $acc14,16,$acc14
xor $acc10,$t2,$t2
srlx $acc15,24,$acc15 !
xor $acc11,$t2,$t2
xor $acc12,$acc14,$acc14
xor $acc13,$t3,$t3
srl $t0,21,$acc0
xor $acc14,$t3,$t3
srl $t1,13,$acc1
xor $acc15,$t3,$t3
and $acc0,2040,$acc0 !
srl $t2,5,$acc2
and $acc1,2040,$acc1
ldx [$tbl+$acc0],$acc0
sll $t3,3,$acc3
and $acc2,2040,$acc2
ldx [$tbl+$acc1],$acc1
fmovs %f0,%f0
srl $t1,21,$acc4 !
and $acc3,2040,$acc3
ldx [$tbl+$acc2],$acc2
srl $t2,13,$acc5
and $acc4,2040,$acc4
ldx [$tbl+$acc3],$acc3
srl $t3,5,$acc6
and $acc5,2040,$acc5
ldx [$tbl+$acc4],$acc4 !
sll $t0,3,$acc7
and $acc6,2040,$acc6
ldx [$tbl+$acc5],$acc5
srl $t2,21,$acc8
and $acc7,2040,$acc7
ldx [$tbl+$acc6],$acc6
fmovs %f0,%f0
srl $t3,13,$acc9 !
and $acc8,2040,$acc8
ldx [$tbl+$acc7],$acc7
srl $t0,5,$acc10
and $acc9,2040,$acc9
ldx [$tbl+$acc8],$acc8
sll $t1,3,$acc11
and $acc10,2040,$acc10
ldx [$tbl+$acc9],$acc9 !
srl $t3,21,$acc12
and $acc11,2040,$acc11
ldx [$tbl+$acc10],$acc10
srl $t0,13,$acc13
and $acc12,2040,$acc12
ldx [$tbl+$acc11],$acc11
fmovs %f0,%f0
srl $t1,5,$acc14 !
and $acc13,2040,$acc13
ldx [$tbl+$acc12],$acc12
sll $t2,3,$acc15
and $acc14,2040,$acc14
ldx [$tbl+$acc13],$acc13
srlx $acc1,8,$acc1
and $acc15,2040,$acc15
ldx [$tbl+$acc14],$acc14 !
srlx $acc2,16,$acc2
xor $acc0,$s0,$s0
ldx [$tbl+$acc15],$acc15
srlx $acc3,24,$acc3
xor $acc1,$s0,$s0
ld [$key+16],$t0
fmovs %f0,%f0
srlx $acc5,8,$acc5 !
xor $acc2,$s0,$s0
ld [$key+20],$t1
srlx $acc6,16,$acc6
xor $acc3,$s0,$s0
ld [$key+24],$t2
srlx $acc7,24,$acc7
xor $acc4,$s1,$s1
ld [$key+28],$t3 !
srlx $acc9,8,$acc9
xor $acc5,$s1,$s1
ldx [$tbl+2048+0],%g0 ! prefetch te4
srlx $acc10,16,$acc10
xor $acc6,$s1,$s1
ldx [$tbl+2048+32],%g0 ! prefetch te4
srlx $acc11,24,$acc11
xor $acc7,$s1,$s1
ldx [$tbl+2048+64],%g0 ! prefetch te4
srlx $acc13,8,$acc13
xor $acc8,$s2,$s2
ldx [$tbl+2048+96],%g0 ! prefetch te4
srlx $acc14,16,$acc14 !
xor $acc9,$s2,$s2
ldx [$tbl+2048+128],%g0 ! prefetch te4
srlx $acc15,24,$acc15
xor $acc10,$s2,$s2
ldx [$tbl+2048+160],%g0 ! prefetch te4
srl $s0,21,$acc0
xor $acc11,$s2,$s2
ldx [$tbl+2048+192],%g0 ! prefetch te4
xor $acc12,$acc14,$acc14
xor $acc13,$s3,$s3
ldx [$tbl+2048+224],%g0 ! prefetch te4
srl $s1,13,$acc1 !
xor $acc14,$s3,$s3
xor $acc15,$s3,$s3
ba .Lenc_loop
and $acc0,2040,$acc0
.align 32
.Lenc_last:
srlx $acc1,8,$acc1 !
xor $acc0,$t0,$t0
ld [$key+0],$s0
srlx $acc2,16,$acc2
xor $acc1,$t0,$t0
ld [$key+4],$s1
srlx $acc3,24,$acc3
xor $acc2,$t0,$t0
ld [$key+8],$s2 !
srlx $acc5,8,$acc5
xor $acc3,$t0,$t0
ld [$key+12],$s3
srlx $acc6,16,$acc6
xor $acc4,$t1,$t1
srlx $acc7,24,$acc7
xor $acc5,$t1,$t1
srlx $acc9,8,$acc9 !
xor $acc6,$t1,$t1
srlx $acc10,16,$acc10
xor $acc7,$t1,$t1
srlx $acc11,24,$acc11
xor $acc8,$t2,$t2
srlx $acc13,8,$acc13
xor $acc9,$t2,$t2
srlx $acc14,16,$acc14 !
xor $acc10,$t2,$t2
srlx $acc15,24,$acc15
xor $acc11,$t2,$t2
xor $acc12,$acc14,$acc14
xor $acc13,$t3,$t3
srl $t0,24,$acc0
xor $acc14,$t3,$t3
srl $t1,16,$acc1 !
xor $acc15,$t3,$t3
srl $t2,8,$acc2
and $acc1,255,$acc1
ldub [$rounds+$acc0],$acc0
srl $t1,24,$acc4
and $acc2,255,$acc2
ldub [$rounds+$acc1],$acc1
srl $t2,16,$acc5 !
and $t3,255,$acc3
ldub [$rounds+$acc2],$acc2
ldub [$rounds+$acc3],$acc3
srl $t3,8,$acc6
and $acc5,255,$acc5
ldub [$rounds+$acc4],$acc4
fmovs %f0,%f0
srl $t2,24,$acc8 !
and $acc6,255,$acc6
ldub [$rounds+$acc5],$acc5
srl $t3,16,$acc9
and $t0,255,$acc7
ldub [$rounds+$acc6],$acc6
ldub [$rounds+$acc7],$acc7
fmovs %f0,%f0
srl $t0,8,$acc10 !
and $acc9,255,$acc9
ldub [$rounds+$acc8],$acc8
srl $t3,24,$acc12
and $acc10,255,$acc10
ldub [$rounds+$acc9],$acc9
srl $t0,16,$acc13
and $t1,255,$acc11
ldub [$rounds+$acc10],$acc10 !
srl $t1,8,$acc14
and $acc13,255,$acc13
ldub [$rounds+$acc11],$acc11
ldub [$rounds+$acc12],$acc12
and $acc14,255,$acc14
ldub [$rounds+$acc13],$acc13
and $t2,255,$acc15
ldub [$rounds+$acc14],$acc14 !
sll $acc0,24,$acc0
xor $acc3,$s0,$s0
ldub [$rounds+$acc15],$acc15
sll $acc1,16,$acc1
xor $acc0,$s0,$s0
ldx [%sp+$bias+$frame+0],%i7 ! restore return address
fmovs %f0,%f0
sll $acc2,8,$acc2 !
xor $acc1,$s0,$s0
sll $acc4,24,$acc4
xor $acc2,$s0,$s0
sll $acc5,16,$acc5
xor $acc7,$s1,$s1
sll $acc6,8,$acc6
xor $acc4,$s1,$s1
sll $acc8,24,$acc8 !
xor $acc5,$s1,$s1
sll $acc9,16,$acc9
xor $acc11,$s2,$s2
sll $acc10,8,$acc10
xor $acc6,$s1,$s1
sll $acc12,24,$acc12
xor $acc8,$s2,$s2
sll $acc13,16,$acc13 !
xor $acc9,$s2,$s2
sll $acc14,8,$acc14
xor $acc10,$s2,$s2
xor $acc12,$acc14,$acc14
xor $acc13,$s3,$s3
xor $acc14,$s3,$s3
xor $acc15,$s3,$s3
ret
restore
.type _sparcv9_AES_encrypt,#function
.size _sparcv9_AES_encrypt,(.-_sparcv9_AES_encrypt)
.align 32
.globl AES_encrypt
AES_encrypt:
or %o0,%o1,%g1
andcc %g1,3,%g0
bnz,pn %xcc,.Lunaligned_enc
save %sp,-$frame,%sp
ld [%i0+0],%o0
ld [%i0+4],%o1
ld [%i0+8],%o2
ld [%i0+12],%o3
1: call .+8
add %o7,AES_Te-1b,%o4
call _sparcv9_AES_encrypt
mov %i2,%o5
st %o0,[%i1+0]
st %o1,[%i1+4]
st %o2,[%i1+8]
st %o3,[%i1+12]
ret
restore
.align 32
.Lunaligned_enc:
ldub [%i0+0],%l0
ldub [%i0+1],%l1
ldub [%i0+2],%l2
sll %l0,24,%l0
ldub [%i0+3],%l3
sll %l1,16,%l1
ldub [%i0+4],%l4
sll %l2,8,%l2
or %l1,%l0,%l0
ldub [%i0+5],%l5
sll %l4,24,%l4
or %l3,%l2,%l2
ldub [%i0+6],%l6
sll %l5,16,%l5
or %l0,%l2,%o0
ldub [%i0+7],%l7
sll %l6,8,%l6
or %l5,%l4,%l4
ldub [%i0+8],%l0
or %l7,%l6,%l6
ldub [%i0+9],%l1
or %l4,%l6,%o1
ldub [%i0+10],%l2
sll %l0,24,%l0
ldub [%i0+11],%l3
sll %l1,16,%l1
ldub [%i0+12],%l4
sll %l2,8,%l2
or %l1,%l0,%l0
ldub [%i0+13],%l5
sll %l4,24,%l4
or %l3,%l2,%l2
ldub [%i0+14],%l6
sll %l5,16,%l5
or %l0,%l2,%o2
ldub [%i0+15],%l7
sll %l6,8,%l6
or %l5,%l4,%l4
or %l7,%l6,%l6
or %l4,%l6,%o3
1: call .+8
add %o7,AES_Te-1b,%o4
call _sparcv9_AES_encrypt
mov %i2,%o5
srl %o0,24,%l0
srl %o0,16,%l1
stb %l0,[%i1+0]
srl %o0,8,%l2
stb %l1,[%i1+1]
stb %l2,[%i1+2]
srl %o1,24,%l4
stb %o0,[%i1+3]
srl %o1,16,%l5
stb %l4,[%i1+4]
srl %o1,8,%l6
stb %l5,[%i1+5]
stb %l6,[%i1+6]
srl %o2,24,%l0
stb %o1,[%i1+7]
srl %o2,16,%l1
stb %l0,[%i1+8]
srl %o2,8,%l2
stb %l1,[%i1+9]
stb %l2,[%i1+10]
srl %o3,24,%l4
stb %o2,[%i1+11]
srl %o3,16,%l5
stb %l4,[%i1+12]
srl %o3,8,%l6
stb %l5,[%i1+13]
stb %l6,[%i1+14]
stb %o3,[%i1+15]
ret
restore
.type AES_encrypt,#function
.size AES_encrypt,(.-AES_encrypt)
___
# Start of the decrypt side: place the AES_Td label on a 256-byte
# boundary. The table contents are appended by the statements that
# follow.
$code.=<<___;
.align 256
AES_Td:
___
# The 256 32-bit words of the AES_Td table. _data_word emits each of
# them twice, so they occupy 256*8 = 2048 bytes in the output.
&_data_word(
0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
# Remainder of the decrypt side, emitted as one template. It has the
# same shape as the encrypt template.
#
# 1. A 256-byte table placed directly after the 2048 bytes of doubled
#    words under AES_Td. The round code addresses it as $tbl+2048 and
#    its prefetch comments call it td4. The .type/.size directives then
#    close the AES_Td object.
#
# 2. _sparcv9_AES_decrypt, the core routine. Its entry point is placed
#    16 bytes past a 64-byte boundary (.align 64, .skip 16).
#    In:  $s0-$s3 hold four 32-bit state words, $tbl the address of
#         AES_Td and $key the key schedule (the round count is read
#         from offset 240).
#    Out: $s0-$s3. They are %i registers, so after "restore" the caller
#         sees them as %o0-%o3.
#    - %i7 doubles as $rounds, so the return address is stored in the
#      $locals area of the new frame first and reloaded before "ret".
#    - The first four key words are xored into $s0-$s3 and the next
#      four are preloaded into $t0-$t3.
#    - Table entries are 8 bytes wide (see _data_word), so each index
#      is a state byte times 8 (shift, then mask with 2040). One ldx
#      followed by srlx by 8, 16 or 24 leaves the word rotated right by
#      that amount in the low 32 bits.
#    - The bytes feeding each output word come from the state words in
#      a different order than in the encrypt routine: the first output
#      word takes its bytes from $s0, $s3, $s2 and $s1.
#    - The counter is the round count divided by two. A full pass of
#      .Ldec_loop applies two rounds: 16 lookups indexed by the bytes of
#      $s0-$s3 are folded into $t0-$t3, then the same pattern runs from
#      $t0-$t3 back into $s0-$s3, which were reloaded with key words.
#      $key advances by 32 bytes per pass.
#    - The second half of each pass also loads from $tbl+2048 in
#      32-byte steps into %g0; the result is discarded, the loads only
#      touch the byte table ("prefetch td4").
#    - When the counter reaches zero, "bz,a,pn" leaves the loop after
#      the first 16 lookups. Its annulled delay slot runs only when the
#      branch is taken and repoints $rounds at $tbl+2048.
#    - .Ldec_last folds the pending lookups into $t0-$t3, loads the last
#      four key words into $s0-$s3, then substitutes every byte of
#      $t0-$t3 through the byte table with ldub, shifts it back into
#      position and xors it into $s0-$s3.
#    - "fmovs %f0,%f0" lines are FP no-ops; the substitution at the end
#      of this script removes them from the output.
#    - Many instructions carry a bare trailing "!", mostly eight
#      instructions apart. NOTE(review): this looks like an alignment or
#      issue-group marker - confirm before reordering anything.
#
# 3. AES_decrypt, the exported entry point. On entry %o0 addresses the
#    16-byte input block, %o1 the output block and %o2 the key schedule
#    (visible as %i0-%i2 after "save"). NOTE(review): presumably this
#    matches AES_decrypt(in, out, key) on the C side - confirm against
#    the header.
#    - "or"/"andcc" test the two low bits of both pointers. The "save"
#      sits in the delay slot of the branch and runs on both paths.
#    - Aligned path: the block is read with four "ld" and written with
#      four "st".
#    - .Lunaligned_dec: the block is read with sixteen "ldub", each
#      group of four combined most significant byte first, and written
#      back with sixteen "stb".
#    - On both paths "call .+8" records its own address in %o7 and the
#      "add" in its delay slot turns that into the address of AES_Td in
#      %o4; the key pointer is copied to %o5 in the delay slot of the
#      call to the core routine.
$code.=<<___;
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.type AES_Td,#object
.size AES_Td,(.-AES_Td)
.align 64
.skip 16
_sparcv9_AES_decrypt:
save %sp,-$frame-$locals,%sp
stx %i7,[%sp+$bias+$frame+0] ! off-load return address
ld [$key+240],$rounds
ld [$key+0],$t0
ld [$key+4],$t1 !
ld [$key+8],$t2
ld [$key+12],$t3
srl $rounds,1,$rounds
xor $t0,$s0,$s0
ld [$key+16],$t0
xor $t1,$s1,$s1
ld [$key+20],$t1
srl $s0,21,$acc0 !
xor $t2,$s2,$s2
ld [$key+24],$t2
xor $t3,$s3,$s3
and $acc0,2040,$acc0
ld [$key+28],$t3
srl $s3,13,$acc1
nop
.Ldec_loop:
srl $s2,5,$acc2 !
and $acc1,2040,$acc1
ldx [$tbl+$acc0],$acc0
sll $s1,3,$acc3
and $acc2,2040,$acc2
ldx [$tbl+$acc1],$acc1
srl $s1,21,$acc4
and $acc3,2040,$acc3
ldx [$tbl+$acc2],$acc2 !
srl $s0,13,$acc5
and $acc4,2040,$acc4
ldx [$tbl+$acc3],$acc3
srl $s3,5,$acc6
and $acc5,2040,$acc5
ldx [$tbl+$acc4],$acc4
fmovs %f0,%f0
sll $s2,3,$acc7 !
and $acc6,2040,$acc6
ldx [$tbl+$acc5],$acc5
srl $s2,21,$acc8
and $acc7,2040,$acc7
ldx [$tbl+$acc6],$acc6
srl $s1,13,$acc9
and $acc8,2040,$acc8
ldx [$tbl+$acc7],$acc7 !
srl $s0,5,$acc10
and $acc9,2040,$acc9
ldx [$tbl+$acc8],$acc8
sll $s3,3,$acc11
and $acc10,2040,$acc10
ldx [$tbl+$acc9],$acc9
fmovs %f0,%f0
srl $s3,21,$acc12 !
and $acc11,2040,$acc11
ldx [$tbl+$acc10],$acc10
srl $s2,13,$acc13
and $acc12,2040,$acc12
ldx [$tbl+$acc11],$acc11
srl $s1,5,$acc14
and $acc13,2040,$acc13
ldx [$tbl+$acc12],$acc12 !
sll $s0,3,$acc15
and $acc14,2040,$acc14
ldx [$tbl+$acc13],$acc13
and $acc15,2040,$acc15
add $key,32,$key
ldx [$tbl+$acc14],$acc14
fmovs %f0,%f0
subcc $rounds,1,$rounds !
ldx [$tbl+$acc15],$acc15
bz,a,pn %icc,.Ldec_last
add $tbl,2048,$rounds
srlx $acc1,8,$acc1
xor $acc0,$t0,$t0
ld [$key+0],$s0
fmovs %f0,%f0
srlx $acc2,16,$acc2 !
xor $acc1,$t0,$t0
ld [$key+4],$s1
srlx $acc3,24,$acc3
xor $acc2,$t0,$t0
ld [$key+8],$s2
srlx $acc5,8,$acc5
xor $acc3,$t0,$t0
ld [$key+12],$s3 !
srlx $acc6,16,$acc6
xor $acc4,$t1,$t1
fmovs %f0,%f0
srlx $acc7,24,$acc7
xor $acc5,$t1,$t1
srlx $acc9,8,$acc9
xor $acc6,$t1,$t1
srlx $acc10,16,$acc10 !
xor $acc7,$t1,$t1
srlx $acc11,24,$acc11
xor $acc8,$t2,$t2
srlx $acc13,8,$acc13
xor $acc9,$t2,$t2
srlx $acc14,16,$acc14
xor $acc10,$t2,$t2
srlx $acc15,24,$acc15 !
xor $acc11,$t2,$t2
xor $acc12,$acc14,$acc14
xor $acc13,$t3,$t3
srl $t0,21,$acc0
xor $acc14,$t3,$t3
xor $acc15,$t3,$t3
srl $t3,13,$acc1
and $acc0,2040,$acc0 !
srl $t2,5,$acc2
and $acc1,2040,$acc1
ldx [$tbl+$acc0],$acc0
sll $t1,3,$acc3
and $acc2,2040,$acc2
ldx [$tbl+$acc1],$acc1
fmovs %f0,%f0
srl $t1,21,$acc4 !
and $acc3,2040,$acc3
ldx [$tbl+$acc2],$acc2
srl $t0,13,$acc5
and $acc4,2040,$acc4
ldx [$tbl+$acc3],$acc3
srl $t3,5,$acc6
and $acc5,2040,$acc5
ldx [$tbl+$acc4],$acc4 !
sll $t2,3,$acc7
and $acc6,2040,$acc6
ldx [$tbl+$acc5],$acc5
srl $t2,21,$acc8
and $acc7,2040,$acc7
ldx [$tbl+$acc6],$acc6
fmovs %f0,%f0
srl $t1,13,$acc9 !
and $acc8,2040,$acc8
ldx [$tbl+$acc7],$acc7
srl $t0,5,$acc10
and $acc9,2040,$acc9
ldx [$tbl+$acc8],$acc8
sll $t3,3,$acc11
and $acc10,2040,$acc10
ldx [$tbl+$acc9],$acc9 !
srl $t3,21,$acc12
and $acc11,2040,$acc11
ldx [$tbl+$acc10],$acc10
srl $t2,13,$acc13
and $acc12,2040,$acc12
ldx [$tbl+$acc11],$acc11
fmovs %f0,%f0
srl $t1,5,$acc14 !
and $acc13,2040,$acc13
ldx [$tbl+$acc12],$acc12
sll $t0,3,$acc15
and $acc14,2040,$acc14
ldx [$tbl+$acc13],$acc13
srlx $acc1,8,$acc1
and $acc15,2040,$acc15
ldx [$tbl+$acc14],$acc14 !
srlx $acc2,16,$acc2
xor $acc0,$s0,$s0
ldx [$tbl+$acc15],$acc15
srlx $acc3,24,$acc3
xor $acc1,$s0,$s0
ld [$key+16],$t0
fmovs %f0,%f0
srlx $acc5,8,$acc5 !
xor $acc2,$s0,$s0
ld [$key+20],$t1
srlx $acc6,16,$acc6
xor $acc3,$s0,$s0
ld [$key+24],$t2
srlx $acc7,24,$acc7
xor $acc4,$s1,$s1
ld [$key+28],$t3 !
srlx $acc9,8,$acc9
xor $acc5,$s1,$s1
ldx [$tbl+2048+0],%g0 ! prefetch td4
srlx $acc10,16,$acc10
xor $acc6,$s1,$s1
ldx [$tbl+2048+32],%g0 ! prefetch td4
srlx $acc11,24,$acc11
xor $acc7,$s1,$s1
ldx [$tbl+2048+64],%g0 ! prefetch td4
srlx $acc13,8,$acc13
xor $acc8,$s2,$s2
ldx [$tbl+2048+96],%g0 ! prefetch td4
srlx $acc14,16,$acc14 !
xor $acc9,$s2,$s2
ldx [$tbl+2048+128],%g0 ! prefetch td4
srlx $acc15,24,$acc15
xor $acc10,$s2,$s2
ldx [$tbl+2048+160],%g0 ! prefetch td4
srl $s0,21,$acc0
xor $acc11,$s2,$s2
ldx [$tbl+2048+192],%g0 ! prefetch td4
xor $acc12,$acc14,$acc14
xor $acc13,$s3,$s3
ldx [$tbl+2048+224],%g0 ! prefetch td4
and $acc0,2040,$acc0 !
xor $acc14,$s3,$s3
xor $acc15,$s3,$s3
ba .Ldec_loop
srl $s3,13,$acc1
.align 32
.Ldec_last:
srlx $acc1,8,$acc1 !
xor $acc0,$t0,$t0
ld [$key+0],$s0
srlx $acc2,16,$acc2
xor $acc1,$t0,$t0
ld [$key+4],$s1
srlx $acc3,24,$acc3
xor $acc2,$t0,$t0
ld [$key+8],$s2 !
srlx $acc5,8,$acc5
xor $acc3,$t0,$t0
ld [$key+12],$s3
srlx $acc6,16,$acc6
xor $acc4,$t1,$t1
srlx $acc7,24,$acc7
xor $acc5,$t1,$t1
srlx $acc9,8,$acc9 !
xor $acc6,$t1,$t1
srlx $acc10,16,$acc10
xor $acc7,$t1,$t1
srlx $acc11,24,$acc11
xor $acc8,$t2,$t2
srlx $acc13,8,$acc13
xor $acc9,$t2,$t2
srlx $acc14,16,$acc14 !
xor $acc10,$t2,$t2
srlx $acc15,24,$acc15
xor $acc11,$t2,$t2
xor $acc12,$acc14,$acc14
xor $acc13,$t3,$t3
srl $t0,24,$acc0
xor $acc14,$t3,$t3
xor $acc15,$t3,$t3 !
srl $t3,16,$acc1
srl $t2,8,$acc2
and $acc1,255,$acc1
ldub [$rounds+$acc0],$acc0
srl $t1,24,$acc4
and $acc2,255,$acc2
ldub [$rounds+$acc1],$acc1
srl $t0,16,$acc5 !
and $t1,255,$acc3
ldub [$rounds+$acc2],$acc2
ldub [$rounds+$acc3],$acc3
srl $t3,8,$acc6
and $acc5,255,$acc5
ldub [$rounds+$acc4],$acc4
fmovs %f0,%f0
srl $t2,24,$acc8 !
and $acc6,255,$acc6
ldub [$rounds+$acc5],$acc5
srl $t1,16,$acc9
and $t2,255,$acc7
ldub [$rounds+$acc6],$acc6
ldub [$rounds+$acc7],$acc7
fmovs %f0,%f0
srl $t0,8,$acc10 !
and $acc9,255,$acc9
ldub [$rounds+$acc8],$acc8
srl $t3,24,$acc12
and $acc10,255,$acc10
ldub [$rounds+$acc9],$acc9
srl $t2,16,$acc13
and $t3,255,$acc11
ldub [$rounds+$acc10],$acc10 !
srl $t1,8,$acc14
and $acc13,255,$acc13
ldub [$rounds+$acc11],$acc11
ldub [$rounds+$acc12],$acc12
and $acc14,255,$acc14
ldub [$rounds+$acc13],$acc13
and $t0,255,$acc15
ldub [$rounds+$acc14],$acc14 !
sll $acc0,24,$acc0
xor $acc3,$s0,$s0
ldub [$rounds+$acc15],$acc15
sll $acc1,16,$acc1
xor $acc0,$s0,$s0
ldx [%sp+$bias+$frame+0],%i7 ! restore return address
fmovs %f0,%f0
sll $acc2,8,$acc2 !
xor $acc1,$s0,$s0
sll $acc4,24,$acc4
xor $acc2,$s0,$s0
sll $acc5,16,$acc5
xor $acc7,$s1,$s1
sll $acc6,8,$acc6
xor $acc4,$s1,$s1
sll $acc8,24,$acc8 !
xor $acc5,$s1,$s1
sll $acc9,16,$acc9
xor $acc11,$s2,$s2
sll $acc10,8,$acc10
xor $acc6,$s1,$s1
sll $acc12,24,$acc12
xor $acc8,$s2,$s2
sll $acc13,16,$acc13 !
xor $acc9,$s2,$s2
sll $acc14,8,$acc14
xor $acc10,$s2,$s2
xor $acc12,$acc14,$acc14
xor $acc13,$s3,$s3
xor $acc14,$s3,$s3
xor $acc15,$s3,$s3
ret
restore
.type _sparcv9_AES_decrypt,#function
.size _sparcv9_AES_decrypt,(.-_sparcv9_AES_decrypt)
.align 32
.globl AES_decrypt
AES_decrypt:
or %o0,%o1,%g1
andcc %g1,3,%g0
bnz,pn %xcc,.Lunaligned_dec
save %sp,-$frame,%sp
ld [%i0+0],%o0
ld [%i0+4],%o1
ld [%i0+8],%o2
ld [%i0+12],%o3
1: call .+8
add %o7,AES_Td-1b,%o4
call _sparcv9_AES_decrypt
mov %i2,%o5
st %o0,[%i1+0]
st %o1,[%i1+4]
st %o2,[%i1+8]
st %o3,[%i1+12]
ret
restore
.align 32
.Lunaligned_dec:
ldub [%i0+0],%l0
ldub [%i0+1],%l1
ldub [%i0+2],%l2
sll %l0,24,%l0
ldub [%i0+3],%l3
sll %l1,16,%l1
ldub [%i0+4],%l4
sll %l2,8,%l2
or %l1,%l0,%l0
ldub [%i0+5],%l5
sll %l4,24,%l4
or %l3,%l2,%l2
ldub [%i0+6],%l6
sll %l5,16,%l5
or %l0,%l2,%o0
ldub [%i0+7],%l7
sll %l6,8,%l6
or %l5,%l4,%l4
ldub [%i0+8],%l0
or %l7,%l6,%l6
ldub [%i0+9],%l1
or %l4,%l6,%o1
ldub [%i0+10],%l2
sll %l0,24,%l0
ldub [%i0+11],%l3
sll %l1,16,%l1
ldub [%i0+12],%l4
sll %l2,8,%l2
or %l1,%l0,%l0
ldub [%i0+13],%l5
sll %l4,24,%l4
or %l3,%l2,%l2
ldub [%i0+14],%l6
sll %l5,16,%l5
or %l0,%l2,%o2
ldub [%i0+15],%l7
sll %l6,8,%l6
or %l5,%l4,%l4
or %l7,%l6,%l6
or %l4,%l6,%o3
1: call .+8
add %o7,AES_Td-1b,%o4
call _sparcv9_AES_decrypt
mov %i2,%o5
srl %o0,24,%l0
srl %o0,16,%l1
stb %l0,[%i1+0]
srl %o0,8,%l2
stb %l1,[%i1+1]
stb %l2,[%i1+2]
srl %o1,24,%l4
stb %o0,[%i1+3]
srl %o1,16,%l5
stb %l4,[%i1+4]
srl %o1,8,%l6
stb %l5,[%i1+5]
stb %l6,[%i1+6]
srl %o2,24,%l0
stb %o1,[%i1+7]
srl %o2,16,%l1
stb %l0,[%i1+8]
srl %o2,8,%l2
stb %l1,[%i1+9]
stb %l2,[%i1+10]
srl %o3,24,%l4
stb %o2,[%i1+11]
srl %o3,16,%l5
stb %l4,[%i1+12]
srl %o3,8,%l6
stb %l5,[%i1+13]
stb %l6,[%i1+14]
stb %o3,[%i1+15]
ret
restore
.type AES_decrypt,#function
.size AES_decrypt,(.-AES_decrypt)
___
# The "fmovs %f0,%f0" lines in the templates stand in for FP no-ops.
# They were originally added to meet specific instruction alignment
# requirements and so maximize ILP. UltraSPARC T1, a.k.a. Niagara, has
# a shared FPU, where FP no-ops can have an undesired effect, so they
# are stripped from the output at the cost of a fraction of a percent
# in performance.
$code =~ s{fmovs.*$}{}mg;

# Write the finished assembly; a failing close is fatal.
print $code;
close(STDOUT) or die "error closing STDOUT: $!"; # ensure flush