Merge pull request #2186 from Sonicadvance1/aarch64_optimize_paired_slowmem

[AArch64] Optimize slowmem paired stores.
Ryan Houdek 2015-03-15 14:37:21 -05:00
commit 5e0b9179db
3 changed files with 219 additions and 212 deletions
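
In short: each quantized store routine previously ran its own address check and saved/restored every caller-saved register around the slowmem call. This change splits every routine into separate fast and slow entry points, moves the address check inline into psq_st, and has the JIT push only the registers actually in use, and only on the slow branch. The dispatch table doubles from 16 to 32 slots; a minimal sketch of the resulting layout (enum names are mine, for illustration):

#include <cstddef>

// Layout of the 32-slot pairedStoreQuantized table after this change.
enum : std::size_t {
    kPairedFast = 0,   // slots 0-7:   paired stores, direct ST1 via logical_base
    kSingleFast = 8,   // slots 8-15:  single stores, fast path
    kPairedSlow = 16,  // slots 16-23: paired stores via PowerPC::Write_*
    kSingleSlow = 24,  // slots 24-31: single stores via PowerPC::Write_*
};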

@@ -2618,6 +2618,7 @@ void ARM64FloatEmitter::SMOV(u8 size, ARM64Reg Rd, ARM64Reg Rn, u8 index)
bool b64Bit = Is64Bit(Rd);
_assert_msg_(DYNA_REC, Rd < SP, "%s destination must be a GPR!", __FUNCTION__);
_assert_msg_(DYNA_REC, size != 64, "%s doesn't support 64bit destination. Use UMOV!", __FUNCTION__);
_assert_msg_(DYNA_REC, size != 32 || b64Bit, "%s doesn't support 32bit move to 32bit register. Use UMOV!", __FUNCTION__);
u32 imm5 = 0;
if (size == 8)
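
For reference, what the tightened assertions permit (illustration only; assumes an ARM64FloatEmitter named float_emit in scope):

// SMOV sign-extends a vector lane into a GPR, so the lane must be
// narrower than the destination.
float_emit.SMOV(8, W0, Q0, 0);   // OK: 8-bit lane sign-extended into W0
float_emit.SMOV(16, X0, Q0, 0);  // OK: 16-bit lane sign-extended into X0
float_emit.SMOV(32, X0, Q0, 0);  // OK: a 32-bit lane must widen into an X register
// float_emit.SMOV(32, W0, Q0, 0);  // asserts: 32-to-32 is no extension; use UMOV
// float_emit.SMOV(64, X0, Q0, 0);  // asserts: no 64-bit lane source; use UMOV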

@@ -94,9 +94,18 @@ void JitArm64::psq_st(UGeckoInstruction inst)
fpr.Lock(Q0, Q1);
ARM64Reg arm_addr = gpr.R(inst.RA);
ARM64Reg VS = fpr.R(inst.RS);
ARM64Reg scale_reg = W0;
ARM64Reg addr_reg = W1;
ARM64Reg type_reg = gpr.GetReg();
ARM64Reg type_reg = W2;
BitSet32 gprs_in_use = gpr.GetCallerSavedUsed();
BitSet32 fprs_in_use = fpr.GetCallerSavedUsed();
// Wipe the registers we are using as temporaries
gprs_in_use &= BitSet32(~0x40000007);
fprs_in_use &= BitSet32(~3);
LDR(INDEX_UNSIGNED, scale_reg, X29, PPCSTATE_OFF(spr[SPR_GQR0 + inst.I]));
@@ -118,13 +127,35 @@ void JitArm64::psq_st(UGeckoInstruction inst)
if (update)
MOV(arm_addr, addr_reg);
ARM64Reg VS = fpr.R(inst.RS);
m_float_emit.FCVTN(32, D0, VS);
MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
LDR(X30, X30, ArithOption(EncodeRegTo64(type_reg), true));
BLR(X30);
gpr.Unlock(W0, W1, W2, W30, type_reg);
// Inline address check
{
TST(addr_reg, 6, 1);
FixupBranch argh = B(CC_NEQ);
// Fast
MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[inst.W * 8]);
LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
BLR(EncodeRegTo64(type_reg));
FixupBranch continue1 = B();
SetJumpTarget(argh);
// Slow
MOVI2R(X30, (u64)&asm_routines.pairedStoreQuantized[16 + inst.W * 8]);
LDR(EncodeRegTo64(type_reg), X30, ArithOption(EncodeRegTo64(type_reg), true));
ABI_PushRegisters(gprs_in_use);
m_float_emit.ABI_PushRegisters(fprs_in_use, X30);
BLR(EncodeRegTo64(type_reg));
m_float_emit.ABI_PopRegisters(fprs_in_use, X30);
ABI_PopRegisters(gprs_in_use);
SetJumpTarget(continue1);
}
gpr.Unlock(W0, W1, W2, W30);
fpr.Unlock(Q0, Q1);
}
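
The inline check reproduces what each routine used to do internally, but the JIT now knows the outcome at the call site, so the push/pop happens only on the slow branch, and only for registers actually live (the &= ~0x40000007 and ~3 masks drop the temporaries X0-X2, X30, Q0 and Q1 from the save sets). Assuming the immr=6/imms=1 logical immediate decodes to the mask 0x0C000000 under the A64 bitmask rules, the decision is roughly:

#include <cstdint>

// Sketch of TST addr_reg, #mask ; B.NE slow (the mask value is my decoding
// of the immr=6/imms=1 logical immediate, not taken from the source).
static bool TakesFastPath(uint32_t guest_addr)
{
    return (guest_addr & 0x0C000000u) == 0;
}
// TakesFastPath(0x80001000) -> true   (cached RAM: direct ST1 store)
// TakesFastPath(0xCC008000) -> false  (MMIO: routed through PowerPC::Write_*)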

@@ -107,7 +107,6 @@ void JitArm64AsmRoutineManager::GenerateCommon()
ARM64Reg addr_reg = X1;
ARM64Reg scale_reg = X0;
ARM64FloatEmitter float_emit(this);
const u32 GPR_CALLER_SAVE = 0x6007FFFF;
const u8* loadPairedIllegal = GetCodePtr();
BRK(100);
@@ -263,299 +262,255 @@ void JitArm64AsmRoutineManager::GenerateCommon()
// Stores
const u8* storePairedIllegal = GetCodePtr();
BRK(0x101);
const u8* storePairedFloat = GetCodePtr();
const u8* storePairedFloat;
const u8* storePairedFloatSlow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storePairedFloat = GetCodePtr();
float_emit.REV32(8, D0, D0);
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(64, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedFloatSlow = GetCodePtr();
float_emit.UMOV(64, X0, Q0, 0);
ORR(X0, SP, X0, ArithOption(X0, ST_ROR, 32));
MOVI2R(X30, (u64)PowerPC::Write_U64);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U64);
BR(X2);
}
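
Two details of the new slow tail are worth noting. The UMOV/ORR pair moves both packed singles into X0 and rotates by 32 so ps0 lands in the high half, which the big-endian Write_U64 then emits first (in ORR's shifted-register form, register 31 is ZR, so this is a plain rotate). And the routine now ends in BR rather than BLR+RET: Write_U64 returns straight to the JIT block, which is what lets the caller own the register save/restore. A scalar analogue of the rotate:

#include <cstdint>

// Analogue of ORR(X0, ZR, X0, ROR #32): swap the two 32-bit lanes.
static uint64_t SwapLanes(uint64_t packed)
{
    return (packed >> 32) | (packed << 32);
}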
const u8* storePairedU8 = GetCodePtr();
const u8* storePairedU8;
const u8* storePairedU8Slow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1, 0);
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
auto emit_quantize = [this, &float_emit, scale_reg]()
{
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1, 0);
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
};
storePairedU8 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(16, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedU8Slow = GetCodePtr();
emit_quantize();
float_emit.UMOV(16, W0, Q0, 0);
REV16(W0, W0);
MOVI2R(X30, (u64)PowerPC::Write_U16);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U16);
BR(X2);
}
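
The duplicated quantize sequence is now an emit_quantize lambda shared by both entry points. In scalar terms, each lane goes through roughly the following (my sketch; FCVTZU rounds toward zero and saturates at the u32 bounds, while XTN truncates when narrowing):

#include <cstdint>

// Scalar sketch of FMUL -> FCVTZU -> XTN -> XTN on the U8 path.
static uint8_t QuantizeToU8(float value, float scale)
{
    float scaled = value * scale;                 // FMUL by the GQR scale entry
    uint32_t wide = scaled <= 0.0f          ? 0u           // FCVTZU saturates low
                  : scaled >= 4294967296.0f ? 0xFFFFFFFFu  // ...and high (2^32)
                  : static_cast<uint32_t>(scaled);         // round toward zero
    uint16_t half = static_cast<uint16_t>(wide);  // XTN 32->16: truncate
    return static_cast<uint8_t>(half);            // XTN 16->8:  truncate
}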
const u8* storePairedS8 = GetCodePtr();
const u8* storePairedS8;
const u8* storePairedS8Slow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1, 0);
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
auto emit_quantize = [this, &float_emit, scale_reg]()
{
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1, 0);
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
};
storePairedS8 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(16, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedS8Slow = GetCodePtr();
emit_quantize();
float_emit.UMOV(16, W0, Q0, 0);
REV16(W0, W0);
MOVI2R(X30, (u64)PowerPC::Write_U16);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U16);
BR(X2);
}
const u8* storePairedU16 = GetCodePtr();
const u8* storePairedU16;
const u8* storePairedU16Slow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
auto emit_quantize = [this, &float_emit, scale_reg]()
{
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1, 0);
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.REV16(8, D0, D0);
};
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1, 0);
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.REV16(8, D0, D0);
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storePairedU16 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(32, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedU16Slow = GetCodePtr();
emit_quantize();
float_emit.REV32(8, D0, D0);
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)PowerPC::Write_U32);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U32);
BR(X2);
}
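
The 16-bit quantize ends with REV16 so the fast ST1 writes each element big-endian, as guest memory expects. The slow path then adds a REV32 before the UMOV: REV16 followed by REV32 nets out to swapping the two halfwords, so after Write_U32's own byte swap the elements land in the right order. The per-element swap in scalar form:

#include <cstdint>

// Scalar analogue of REV16: byte-swap one 16-bit element.
static uint16_t ToBigEndian16(uint16_t host_value)
{
    return static_cast<uint16_t>((host_value >> 8) | (host_value << 8));
}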
const u8* storePairedS16 = GetCodePtr(); // Used by Viewtiful Joe's intro movie
const u8* storePairedS16; // Used by Viewtiful Joe's intro movie
const u8* storePairedS16Slow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
auto emit_quantize = [this, &float_emit, scale_reg]()
{
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1, 0);
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.REV16(8, D0, D0);
};
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1, 0);
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.REV16(8, D0, D0);
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storePairedS16 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(32, Q0, 0, addr_reg, SP);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storePairedS16Slow = GetCodePtr();
emit_quantize();
float_emit.REV32(8, D0, D0);
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)PowerPC::Write_U32);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)PowerPC::Write_U32);
BR(X2);
}
const u8* storeSingleFloat = GetCodePtr();
const u8* storeSingleFloat;
const u8* storeSingleFloatSlow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleFloat = GetCodePtr();
float_emit.REV32(8, D0, D0);
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.STR(32, INDEX_UNSIGNED, D0, addr_reg, 0);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
storeSingleFloatSlow = GetCodePtr();
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U32);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
MOVI2R(X2, (u64)&PowerPC::Write_U32);
BR(X2);
}
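
All the fast paths build their host pointer the same way: the guest address occupies the low 32 bits of addr_reg, and MOVK patches the halfword at bit 32 with the top of Memory::logical_base. This assumes logical_base has zero low 32 bits and fits in 48 bits of address space, so a single MOVK suffices. Rough analogue (helper name is mine):

#include <cstdint>

// Sketch of MOVK(addr_reg, (u64)Memory::logical_base >> 32, SHIFT_32).
static uint64_t HostPointer(uint64_t logical_base, uint32_t guest_addr)
{
    return (((logical_base >> 32) & 0xFFFFull) << 32) | guest_addr;
}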
const u8* storeSingleU8 = GetCodePtr(); // Used by MKWii
const u8* storeSingleU8; // Used by MKWii
const u8* storeSingleU8Slow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
auto emit_quantize = [this, &float_emit, scale_reg]()
{
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
};
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleU8 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(8, Q0, 0, addr_reg);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U8);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
storeSingleU8Slow = GetCodePtr();
emit_quantize();
float_emit.UMOV(8, W0, Q0, 0);
MOVI2R(X2, (u64)&PowerPC::Write_U8);
BR(X2);
}
const u8* storeSingleS8 = GetCodePtr();
const u8* storeSingleS8;
const u8* storeSingleS8Slow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
auto emit_quantize = [this, &float_emit, scale_reg]()
{
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
};
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
float_emit.XTN(8, D0, D0);
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleS8 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.ST1(8, Q0, 0, addr_reg);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
float_emit.SMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U8);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
storeSingleS8Slow = GetCodePtr();
emit_quantize();
float_emit.SMOV(8, W0, Q0, 0);
MOVI2R(X2, (u64)&PowerPC::Write_U8);
BR(X2);
}
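
The single-value slow paths now extract the already-narrowed lane instead of a full 32-bit one: UMOV zero-extends into the GPR for the unsigned variants, SMOV sign-extends for the signed ones, which is exactly the distinction the new SMOV assertion in the first file polices. Scalar analogue:

#include <cstdint>

// UMOV(8, W0, Q0, 0) zero-extends; SMOV(8, W0, Q0, 0) sign-extends.
static uint32_t Umov8(int8_t lane) { return static_cast<uint8_t>(lane); }
static int32_t Smov8(int8_t lane) { return lane; }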
const u8* storeSingleU16 = GetCodePtr(); // Used by MKWii
const u8* storeSingleU16; // Used by MKWii
const u8* storeSingleU16Slow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
auto emit_quantize = [this, &float_emit, scale_reg]()
{
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
};
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZU(32, D0, D0);
float_emit.XTN(16, D0, D0);
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleU16 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.REV16(8, D0, D0);
float_emit.ST1(16, Q0, 0, addr_reg);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
float_emit.UMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U16);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
storeSingleU16Slow = GetCodePtr();
emit_quantize();
float_emit.UMOV(16, W0, Q0, 0);
MOVI2R(X2, (u64)&PowerPC::Write_U16);
BR(X2);
}
const u8* storeSingleS16 = GetCodePtr();
const u8* storeSingleS16;
const u8* storeSingleS16Slow;
{
BitSet32 gprs(GPR_CALLER_SAVE & ~7); // All except X0/X1/X2
BitSet32 fprs(~3); // All except Q0/Q1
auto emit_quantize = [this, &float_emit, scale_reg]()
{
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
};
MOVI2R(X2, (u64)&m_quantizeTableS);
ADD(scale_reg, X2, scale_reg, ArithOption(scale_reg, ST_LSL, 3));
float_emit.LDR(32, INDEX_UNSIGNED, D1, scale_reg, 0);
float_emit.FMUL(32, D0, D0, D1);
float_emit.FCVTZS(32, D0, D0);
float_emit.XTN(16, D0, D0);
TST(DecodeReg(addr_reg), 6, 1);
FixupBranch argh = B(CC_NEQ);
storeSingleS16 = GetCodePtr();
emit_quantize();
MOVK(addr_reg, ((u64)Memory::logical_base >> 32) & 0xFFFF, SHIFT_32);
float_emit.REV16(8, D0, D0);
float_emit.ST1(16, Q0, 0, addr_reg);
RET(X30);
SetJumpTarget(argh);
ABI_PushRegisters(gprs);
float_emit.ABI_PushRegisters(fprs, X3);
float_emit.SMOV(32, W0, Q0, 0);
MOVI2R(X30, (u64)&PowerPC::Write_U16);
BLR(X30);
float_emit.ABI_PopRegisters(fprs, X3);
ABI_PopRegisters(gprs);
RET(X30);
storeSingleS16Slow = GetCodePtr();
emit_quantize();
float_emit.SMOV(16, W0, Q0, 0);
MOVI2R(X2, (u64)&PowerPC::Write_U16);
BR(X2);
}
pairedStoreQuantized = reinterpret_cast<const u8**>(const_cast<u8*>(AlignCode16()));
ReserveCodeSpace(16 * sizeof(u8*));
ReserveCodeSpace(32 * sizeof(u8*));
// Fast
pairedStoreQuantized[0] = storePairedFloat;
pairedStoreQuantized[1] = storePairedIllegal;
pairedStoreQuantized[2] = storePairedIllegal;
@@ -573,4 +528,24 @@ void JitArm64AsmRoutineManager::GenerateCommon()
pairedStoreQuantized[13] = storeSingleU16;
pairedStoreQuantized[14] = storeSingleS8;
pairedStoreQuantized[15] = storeSingleS16;
// Slow
pairedStoreQuantized[16] = storePairedFloatSlow;
pairedStoreQuantized[17] = storePairedIllegal;
pairedStoreQuantized[18] = storePairedIllegal;
pairedStoreQuantized[19] = storePairedIllegal;
pairedStoreQuantized[20] = storePairedU8Slow;
pairedStoreQuantized[21] = storePairedU16Slow;
pairedStoreQuantized[22] = storePairedS8Slow;
pairedStoreQuantized[23] = storePairedS16Slow;
pairedStoreQuantized[24] = storeSingleFloatSlow;
pairedStoreQuantized[25] = storePairedIllegal;
pairedStoreQuantized[26] = storePairedIllegal;
pairedStoreQuantized[27] = storePairedIllegal;
pairedStoreQuantized[28] = storeSingleU8Slow;
pairedStoreQuantized[29] = storeSingleU16Slow;
pairedStoreQuantized[30] = storeSingleS8Slow;
pairedStoreQuantized[31] = storeSingleS16Slow;
}
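
With the doubled table, psq_st indexes the two halves uniformly: inst.W * 8 selects the single-store block, the GQR store type selects the slot, and +16 selects the slow variant. A hypothetical helper that mirrors the layout:

#include <cstddef>

// Slot layout: fast 0-15, slow 16-31; singles offset by 8 within each half.
static std::size_t StoreTableIndex(bool single, unsigned type, bool slow)
{
    return (slow ? 16 : 0) + (single ? 8 : 0) + type;
}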