x86/APX: optimize {nf} forms of ADD/SUB with specific immediates

Unlike for the legacy forms, where there's a difference in the resulting
EFLAGS, for the NF variants we can safely replace ones using an immediate
of 0x80 by the respective other insn (ADD <-> SUB) while negating the
immediate, saving 3 immediate bytes (just 1 though for 16-bit operand
size). Similarly we can replace ones using 1 / -1 by INC/DEC (eliminating
the immediate).
This commit is contained in:
Jan Beulich 2024-06-28 08:18:40 +02:00
parent f63d85cc78
commit 0868b8999b
8 changed files with 1648 additions and 22 deletions

View File

@ -5327,6 +5327,84 @@ optimize_encoding (void)
}
}
/* For {nf} ADD / SUB with a constant immediate, switch to a shorter encoding
   which is functionally identical: either the opposite insn with a negated
   8-bit sign-extended immediate, or INC / DEC with no immediate at all.  */
static void
optimize_nf_encoding (void)
{
  int is_add;
  unsigned int op;

  /* Only the immediate forms (opcodes 0x80 ... 0x83) of ADD (/0) and SUB (/5)
     are of interest, and only when the immediate is a plain constant.  */
  if ((i.tm.base_opcode | 3) != 0x83)
    return;
  if (i.tm.extension_opcode != 0 && i.tm.extension_opcode != 5)
    return;
  if (i.op[0].imms->X_op != O_constant)
    return;

  is_add = i.tm.extension_opcode == 0;

  if (i.tm.base_opcode == 0x80
      && i.suffix != BYTE_MNEM_SUFFIX
      && !(i.types[1].bitfield.byte || i.types[2].bitfield.byte)
      && i.op[0].imms->X_add_number == 0x80)
    {
      /* Optimize: -O:
           {nf} add{w,l,q} $0x80, ...  -> {nf} sub{w,l,q} $-0x80, ...
           {nf} sub{w,l,q} $0x80, ...  -> {nf} add{w,l,q} $-0x80, ...

         0x80 doesn't fit a sign-extended 8-bit immediate, but -0x80 does.  */
      i.tm.base_opcode = 0x83;
      i.tm.extension_opcode = is_add ? 5 : 0;
      i.tm.opcode_modifier.w = 0;
      i.op[0].imms->X_add_number = -0x80;

      /* The immediate operand now permits nothing but Imm8S.  */
      i.tm.operand_types[0].bitfield.imm8s = 1;
      i.tm.operand_types[0].bitfield.imm8 = 0;
      i.tm.operand_types[0].bitfield.imm16 = 0;
      i.tm.operand_types[0].bitfield.imm32 = 0;
      i.tm.operand_types[0].bitfield.imm32s = 0;
      i.types[0] = i.tm.operand_types[0];
      return;
    }

  if (i.op[0].imms->X_add_number == 0xff)
    {
      /* While for wider than byte operations immediates were suitably
         adjusted earlier on, 0xff stands for -1 only in the byte case.  */
      if (i.suffix != BYTE_MNEM_SUFFIX
          && !i.types[i.operands - 1].bitfield.byte)
        return;
    }
  else if (i.op[0].imms->X_add_number != 1
           && i.op[0].imms->X_add_number != -1)
    return;

  /* Optimize: -O:
       {nf} add $1, ...        -> {nf} inc ...
       {nf} add $-1, ...       -> {nf} dec ...
       {nf} add $0xf...f, ...  -> {nf} dec ...
       {nf} sub $1, ...        -> {nf} dec ...
       {nf} sub $-1, ...       -> {nf} inc ...
       {nf} sub $0xf...f, ...  -> {nf} inc ...

     The new extension opcode is 0 (INC) when adding 1 or subtracting -1,
     and 1 (DEC) otherwise.  It has to be determined before the immediate
     gets dropped from the operand array below.  */
  i.tm.extension_opcode
    = ((i.op[0].imms->X_add_number == 1) == is_add) ? 0 : 1;
  i.tm.base_opcode = 0xfe;
  i.tm.opcode_modifier.w = 1;

  /* Drop the immediate by moving the remaining operands down one slot.  */
  for (op = 0; op < 2; ++op)
    {
      i.types[op] = i.types[op + 1];
      i.tm.operand_types[op] = i.tm.operand_types[op + 1];
      i.op[op] = i.op[op + 1];
      i.flags[op] = i.flags[op + 1];
    }
  i.reloc[0] = i.reloc[1];
  i.reloc[1] = NO_RELOC;

  i.imm_operands = 0;
  --i.operands;
}
static void
s_noopt (int dummy ATTRIBUTE_UNUSED)
{
@ -7206,7 +7284,11 @@ md_assemble (char *line)
}
if (optimize && !i.no_optimize && i.tm.opcode_modifier.optimize)
optimize_encoding ();
{
if (i.has_nf)
optimize_nf_encoding ();
optimize_encoding ();
}
/* Past optimization there's no need to distinguish encoding_evex,
encoding_evex512, and encoding_egpr anymore. */

View File

@ -701,6 +701,8 @@ Disassembly of section \.text:
\s*[a-f0-9]+:\s*62 d4 6c 1c 33 8c 80 23 01 00 00\s+\{nf\} xor edx,ecx,DWORD PTR \[r8\+rax\*4\+0x123\]
\s*[a-f0-9]+:\s*62 54 fc 0c 33 8c 80 23 01 00 00\s+\{nf\} xor r9,QWORD PTR \[r8\+rax\*4\+0x123\]
\s*[a-f0-9]+:\s*62 54 84 14 33 8c 80 23 01 00 00\s+\{nf\} xor r31,r9,QWORD PTR \[r8\+rax\*4\+0x123\]
0[0-9a-f]+ <intel>:
\s*[a-f0-9]+:\s*62 f4 7c 0c 80 c3 7b\s+\{nf\} add bl,0x7b
\s*[a-f0-9]+:\s*62 f4 6c 1c 80 c3 7b\s+\{nf\} add dl,bl,0x7b
\s*[a-f0-9]+:\s*62 f4 7d 0c 83 c2 7b\s+\{nf\} add dx,0x7b

File diff suppressed because it is too large Load Diff

View File

@ -701,6 +701,8 @@ Disassembly of section \.text:
\s*[a-f0-9]+:\s*62 d4 6c 1c 33 8c 80 23 01 00 00\s+\{nf\} xor 0x123\(%r8,%rax,4\),%ecx,%edx
\s*[a-f0-9]+:\s*62 54 fc 0c 33 8c 80 23 01 00 00\s+\{nf\} xor 0x123\(%r8,%rax,4\),%r9
\s*[a-f0-9]+:\s*62 54 84 14 33 8c 80 23 01 00 00\s+\{nf\} xor 0x123\(%r8,%rax,4\),%r9,%r31
0[0-9a-f]+ <intel>:
\s*[a-f0-9]+:\s*62 f4 7c 0c 80 c3 7b\s+\{nf\} add\s+\$0x7b,%bl
\s*[a-f0-9]+:\s*62 f4 6c 1c 80 c3 7b\s+\{nf\} add\s+\$0x7b,%bl,%dl
\s*[a-f0-9]+:\s*62 f4 7d 0c 83 c2 7b\s+\{nf\} add\s+\$0x7b,%dx

View File

@ -697,7 +697,8 @@ _start:
{nf} xor 291(%r8, %rax, 4), %r9
{nf} xor 291(%r8, %rax, 4), %r9, %r31
.intel_syntax noprefix
.intel_syntax noprefix
intel:
{nf} add bl, 123
{nf} add dl, bl, 123
{nf} add dx, 123
@ -1377,3 +1378,58 @@ _start:
{nf} xor edx, ecx, DWORD PTR [r8+rax*4+291]
{nf} xor r9, QWORD PTR [r8+rax*4+291]
{nf} xor r31, r9, QWORD PTR [r8+rax*4+291]
.att_syntax prefix
# Inputs for the {nf} ADD / SUB immediate optimizations.  Every group below
# covers byte, word, dword, and qword operand sizes, register as well as
# memory operands, and both the two- and three-operand forms.  The .irp
# instantiates the whole set once for add and once for sub.
optimize:
.irp op, add, sub
# Immediate of 0x80: for operand sizes wider than byte the assembler may
# switch to the opposite insn with an immediate of -0x80.  Byte sized forms
# are not subject to this transformation.
{nf} \op $128, %bl
{nf} \op $128, %bl, %dl
{nf} \op $128, %dx
{nf} \op $128, %dx, %ax
{nf} \op $128, %ecx
{nf} \op $128, %ecx, %edx
{nf} \op $128, %r9
{nf} \op $128, %r9, %r31
{nf} \op\()b $128, (%rax)
{nf} \op $128, (%rax), %bl
{nf} \op\()w $128, (%rax)
{nf} \op $128, (%rax), %dx
{nf} \op\()l $128, (%rax)
{nf} \op $128, (%rax), %ecx
{nf} \op\()q $128, (%rax)
{nf} \op $128, (%rax), %r9
# Immediate of 1: candidates for conversion to INC (add) or DEC (sub).
{nf} \op $1, %bl
{nf} \op $1, %bl, %dl
{nf} \op $1, %dx
{nf} \op $1, %dx, %ax
{nf} \op $1, %ecx
{nf} \op $1, %ecx, %edx
{nf} \op $1, %r9
{nf} \op $1, %r9, %r31
{nf} \op\()b $1, (%rax)
{nf} \op $1, (%rax), %bl
{nf} \op\()w $1, (%rax)
{nf} \op $1, (%rax), %dx
{nf} \op\()l $1, (%rax)
{nf} \op $1, (%rax), %ecx
{nf} \op\()q $1, (%rax)
{nf} \op $1, (%rax), %r9
# Immediate of -1, alternatively spelled as the all-ones value of the operand
# width: candidates for conversion to DEC (add) or INC (sub).
{nf} \op $0xff, %bl
{nf} \op $-1, %bl, %dl
{nf} \op $0xffff, %dx
{nf} \op $-1, %dx, %ax
{nf} \op $0xffffffff, %ecx
{nf} \op $-1, %ecx, %edx
{nf} \op $-1, %r9
{nf} \op $-1, %r9, %r31
{nf} \op\()b $0xff, (%rax)
{nf} \op $-1, (%rax), %bl
{nf} \op\()w $0xffff, (%rax)
{nf} \op $-1, (%rax), %dx
{nf} \op\()l $0xffffffff, (%rax)
{nf} \op $-1, (%rax), %ecx
{nf} \op\()q $-1, (%rax)
{nf} \op $-1, (%rax), %r9
.endr

View File

@ -393,6 +393,7 @@ run_dump_test "x86-64-apx-jmpabs-intel"
run_dump_test "x86-64-apx-jmpabs-inval"
run_dump_test "x86-64-apx-nf"
run_dump_test "x86-64-apx-nf-intel"
run_dump_test "x86-64-apx-nf-optimize"
run_dump_test "x86-64-apx-zu"
run_dump_test "x86-64-apx-zu-intel"
run_list_test "x86-64-apx-zu-inval"

View File

@ -312,25 +312,25 @@ sti, 0xfb, 0, NoSuf, {}
// Arithmetic.
<alu2:opc:c:optz:optt:opti:nf, +
add:0:C::::NF, +
or:1:C::Optimize::NF, +
adc:2:C::::, +
sbb:3:::::, +
and:4:C::Optimize:Optimize:NF, +
sub:5::Optimize:::NF, +
xor:6:C:Optimize:::NF>
<alu2:opc:c:optz:optt:opti:optiE:nf, +
add:0:C::::Optimize:NF, +
or:1:C::Optimize:::NF, +
adc:2:C:::::, +
sbb:3::::::, +
and:4:C::Optimize:Optimize::NF, +
sub:5::Optimize:::Optimize:NF, +
xor:6:C:Optimize::::NF>
<alu2>, <alu2:opc> << 3, APX_F, D|<alu2:c>|W|CheckOperandSize|Modrm|No_sSuf|DstVVVV|EVexMap4|<alu2:nf>|<alu2:optz>, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
<alu2>, <alu2:opc> << 3, 0, D|W|CheckOperandSize|Modrm|No_sSuf|HLEPrefixLock|<alu2:optz>|<alu2:optt>, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
<alu2>, <alu2:opc> << 3, APX_F, D|W|CheckOperandSize|Modrm|No_sSuf|EVexMap4|<alu2:nf>, { Reg8|Reg16|Reg32|Reg64, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
<alu2>, 0x83/<alu2:opc>, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|<alu2:nf>, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
<alu2>, 0x83/<alu2:opc>, APX_F, Modrm|CheckOperandSize|No_bSuf|No_sSuf|DstVVVV|EVexMap4|<alu2:nf>|<alu2:optiE>, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg16|Reg32|Reg64 }
<alu2>, 0x83/<alu2:opc>, 0, Modrm|No_bSuf|No_sSuf|HLEPrefixLock|<alu2:opti>, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex }
<alu2>, 0x83/<alu2:opc>, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|<alu2:nf>, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex }
<alu2>, 0x83/<alu2:opc>, APX_F, Modrm|No_bSuf|No_sSuf|EVexMap4|<alu2:nf>|<alu2:optiE>, { Imm8S, Reg16|Reg32|Reg64|Unspecified|BaseIndex }
<alu2>, 0x04 | (<alu2:opc> << 3), 0, W|No_sSuf|<alu2:opti>, { Imm8|Imm16|Imm32|Imm32S, Acc|Byte|Word|Dword|Qword }
<alu2>, 0x80/<alu2:opc>, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVexMap4|<alu2:nf>, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
<alu2>, 0x80/<alu2:opc>, APX_F, W|Modrm|CheckOperandSize|No_sSuf|DstVVVV|EVexMap4|<alu2:nf>|<alu2:optiE>, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex, Reg8|Reg16|Reg32|Reg64 }
<alu2>, 0x80/<alu2:opc>, 0, W|Modrm|No_sSuf|HLEPrefixLock|<alu2:opti>, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
<alu2>, 0x80/<alu2:opc>, APX_F, W|Modrm|EVexMap4|No_sSuf|<alu2:nf>, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
<alu2>, 0x80/<alu2:opc>, APX_F, W|Modrm|EVexMap4|No_sSuf|<alu2:nf>|<alu2:optiE>, { Imm8|Imm16|Imm32|Imm32S, Reg8|Reg16|Reg32|Reg64|Unspecified|BaseIndex }
<alu2>

View File

@ -826,7 +826,7 @@ static const insn_template i386_optab[] =
0, 0, 0, 0, 1, 0 } } } },
{ MN_add, 0x83, 3, SPACE_EVEXMAP4, 0,
{ 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0 },
{ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
{ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@ -848,7 +848,7 @@ static const insn_template i386_optab[] =
0, 0, 0, 0, 1, 0 } } } },
{ MN_add, 0x83, 2, SPACE_EVEXMAP4, 0,
{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0 },
{ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
{ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@ -868,7 +868,7 @@ static const insn_template i386_optab[] =
0, 0, 0, 0, 0, 0 } } } },
{ MN_add, 0x80, 3, SPACE_EVEXMAP4, 0,
{ 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0 },
{ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
{ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@ -890,7 +890,7 @@ static const insn_template i386_optab[] =
0, 0, 0, 0, 1, 0 } } } },
{ MN_add, 0x80, 2, SPACE_EVEXMAP4, 0,
{ 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0 },
{ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
{ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@ -1356,7 +1356,7 @@ static const insn_template i386_optab[] =
0, 0, 0, 0, 1, 0 } } } },
{ MN_sub, 0x83, 3, SPACE_EVEXMAP4, 5,
{ 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0 },
{ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
{ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@ -1378,7 +1378,7 @@ static const insn_template i386_optab[] =
0, 0, 0, 0, 1, 0 } } } },
{ MN_sub, 0x83, 2, SPACE_EVEXMAP4, 5,
{ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0 },
{ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
{ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@ -1398,7 +1398,7 @@ static const insn_template i386_optab[] =
0, 0, 0, 0, 0, 0 } } } },
{ MN_sub, 0x80, 3, SPACE_EVEXMAP4, 5,
{ 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0 },
{ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
{ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },
@ -1420,7 +1420,7 @@ static const insn_template i386_optab[] =
0, 0, 0, 0, 1, 0 } } } },
{ MN_sub, 0x80, 2, SPACE_EVEXMAP4, 5,
{ 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
1, 0 },
{ { 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0 } },
{ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } },