diff --git a/lightening.am b/lightening.am index 2c9089ead..ba55f2c7f 100644 --- a/lightening.am +++ b/lightening.am @@ -40,6 +40,7 @@ lightening_extra_files = \ $(lightening)/lightening/mips.h \ $(lightening)/lightening/ppc.h \ $(lightening)/lightening/x86.h \ + $(lightening)/lightening/riscv.h \ \ $(lightening)/lightening/aarch64.c \ $(lightening)/lightening/aarch64-cpu.c \ @@ -55,4 +56,7 @@ lightening_extra_files = \ $(lightening)/lightening/ppc-fpu.c \ $(lightening)/lightening/x86.c \ $(lightening)/lightening/x86-cpu.c \ - $(lightening)/lightening/x86-sse.c + $(lightening)/lightening/x86-sse.c \ + $(lightening)/lightening/riscv.c \ + $(lightening)/lightening/riscv-cpu.c \ + $(lightening)/lightening/riscv-fpu.c diff --git a/lightening.h b/lightening.h index 1b296bd66..b364e18cc 100644 --- a/lightening.h +++ b/lightening.h @@ -77,6 +77,8 @@ jit_same_fprs (jit_fpr_t a, jit_fpr_t b) # include "lightening/aarch64.h" #elif defined(__s390__) || defined(__s390x__) # include "lightening/s390.h" +#elif defined(__riscv__) || defined(__riscv) +# include "lightening/riscv.h" #endif enum jit_reloc_kind diff --git a/lightening/endian.h b/lightening/endian.h index 3b34a1518..e3689a117 100644 --- a/lightening/endian.h +++ b/lightening/endian.h @@ -38,6 +38,8 @@ # else # define __WORDSIZE 64 # endif +# elif defined(__riscv_xlen) +# define __WORDSIZE __riscv_xlen /* riscv */ # else /* From FreeBSD 9.1 stdint.h */ # if defined(UINTPTR_MAX) && defined(UINT64_MAX) && \ (UINTPTR_MAX == UINT64_MAX) diff --git a/lightening/lightening.c b/lightening/lightening.c index afc6fd493..c66b3a132 100644 --- a/lightening/lightening.c +++ b/lightening/lightening.c @@ -271,6 +271,22 @@ get_temp_gpr(jit_state_t *_jit) #ifdef JIT_TMP1 case 1: return JIT_TMP1; +#endif +#ifdef JIT_TMP2 + case 2: + return JIT_TMP2; +#endif +#ifdef JIT_TMP3 + case 3: + return JIT_TMP3; +#endif +#ifdef JIT_TMP4 + case 4: + return JIT_TMP4; +#endif +#ifdef JIT_TMP5 + case 5: + return JIT_TMP5; #endif default: abort(); @@ 
-561,6 +577,8 @@ jit_emit_addr(jit_state_t *j) # include "aarch64.c" #elif defined(__s390__) || defined(__s390x__) # include "s390.c" +#elif defined(__riscv__) || defined(__riscv) +# include "riscv.c" #endif #define JIT_IMPL_0(stem, ret) \ @@ -1167,6 +1185,9 @@ static const jit_gpr_t user_callee_save_gprs[] = { #endif #ifdef JIT_V9 , JIT_V9 +#endif +#ifdef JIT_V10 + , JIT_V10 #endif }; @@ -1195,6 +1216,18 @@ static const jit_fpr_t user_callee_save_fprs[] = { #ifdef JIT_VF7 , JIT_VF7 #endif +#ifdef JIT_VF8 + , JIT_VF8 +#endif +#ifdef JIT_VF9 + , JIT_VF9 +#endif +#ifdef JIT_VF10 + , JIT_VF10 +#endif +#ifdef JIT_VF11 + , JIT_VF11 +#endif }; #define ARRAY_SIZE(X) (sizeof (X)/sizeof ((X)[0])) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c new file mode 100644 index 000000000..d9d36f30b --- /dev/null +++ b/lightening/riscv-cpu.c @@ -0,0 +1,2470 @@ +/* + * Copyright (C) 2012-2024 Free Software Foundation, Inc. + * + * This file is part of GNU lightning. + * + * GNU lightning is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU lightning is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. 
#define stack_framesize	(200 + 64)

/* True when `im` fits a two's-complement immediate of the given bit width. */
#define simm6_p(im)		((im) <= 31 && (im) >= -32)
#define simm12_p(im)		((im) <= 2047 && (im) >= -2048)
#define simm20_p(im)		((im) <= 524287 && (im) >= -524288)
#define simm32_p(im)		((im) <= 2147483647LL && (im) >= -2147483648LL)

/*
 * Field-by-field view of one 32-bit instruction word, one struct per
 * instruction format, overlaid on the raw word `w`.
 *
 * NOTE(review): the position of each bit-field inside `w` depends on the
 * compiler's bit-field allocation order, which C leaves implementation
 * defined.  The encoders below therefore build `w` with explicit shifts and
 * only keep this union for code that wants to pick fields apart.
 */
typedef union {
  struct {
    uint32_t opcode	: 7;
    uint32_t rd		: 5;
    uint32_t funct3	: 3;
    uint32_t rs1	: 5;
    uint32_t rs2	: 5;
    uint32_t funct7	: 7;
  } R;
  struct {
    uint32_t opcode	: 7;
    uint32_t rd		: 5;
    uint32_t funct3	: 3;
    uint32_t rs1	: 5;
    uint32_t rs2	: 5;
    uint32_t rl		: 1;
    uint32_t aq		: 1;
    uint32_t funct5	: 5;
  } R4;
  struct {
    uint32_t opcode	: 7;
    uint32_t rd		: 5;
    uint32_t funct3	: 3;
    uint32_t rs1	: 5;
    uint32_t imm11_0	: 12;
  } I;
#if __WORDSIZE == 64
  struct {
    uint32_t opcode	: 7;
    uint32_t rd		: 5;
    uint32_t funct3	: 3;
    uint32_t rs1	: 5;
    uint32_t shamt	: 6;
    uint32_t imm6_0	: 6;
  } IS;
#endif
  struct {
    uint32_t opcode	: 7;
    uint32_t imm4_0	: 5;
    uint32_t funct3	: 3;
    uint32_t rs1	: 5;
    uint32_t rs2	: 5;
    uint32_t imm11_5	: 7;
  } S;
  struct {
    uint32_t opcode	: 7;
    uint32_t imm11	: 1;
    uint32_t imm4_1	: 4;
    uint32_t funct3	: 3;
    uint32_t rs1	: 5;
    uint32_t rs2	: 5;
    uint32_t imm10_5	: 6;
    uint32_t imm12	: 1;
  } B;
  struct {
    uint32_t opcode	: 7;
    uint32_t rd		: 5;
    uint32_t imm31_12	: 20;
  } U;
  struct {
    uint32_t opcode	: 7;
    uint32_t rd		: 5;
    uint32_t imm19_12	: 8;
    uint32_t imm11	: 1;
    uint32_t imm10_1	: 10;
    uint32_t imm20	: 1;
  } J;
  uint32_t w;
} instr_t;


// TODO: Compressed instruction support

/* Keep the bits of `value` selected by `mask` and move them up by `shift`. */
static inline uint32_t
riscv_bits(uint32_t value, uint32_t mask, unsigned shift)
{
  return (value & mask) << shift;
}

/*
 * R-type: funct7[31:25] rs2[24:20] rs1[19:15] funct3[14:12] rd[11:7]
 * opcode[6:0].  Every argument must already fit its field.
 */
static uint32_t
Rtype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t rs2,
      int32_t fct2)
{
  assert(!(op   & ~0x7f));
  assert(!(rd   & ~0x1f));
  assert(!(fct  & ~0x07));
  assert(!(rs1  & ~0x1f));
  assert(!(rs2  & ~0x1f));
  assert(!(fct2 & ~0x7f));
  return riscv_bits(op,   0x7f,  0)
       | riscv_bits(rd,   0x1f,  7)
       | riscv_bits(fct,  0x07, 12)
       | riscv_bits(rs1,  0x1f, 15)
       | riscv_bits(rs2,  0x1f, 20)
       | riscv_bits(fct2, 0x7f, 25);
}

/*
 * Atomic (AMO/LR/SC) variant of R-type: funct5[31:27] aq[26] rl[25]
 * rs2[24:20] rs1[19:15] funct3[14:12] rd[11:7] opcode[6:0].
 * Note the parameter order: `aq` comes before `rl`.
 */
static uint32_t
R4type(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t rs2,
       int32_t aq, int32_t rl, int32_t fct5)
{
  assert(!(op   & ~0x7f));
  assert(!(rd   & ~0x1f));
  assert(!(fct  & ~0x07));
  assert(!(rs1  & ~0x1f));
  assert(!(rs2  & ~0x1f));
  assert(!(fct5 & ~0x1f));
  assert(!(aq   & ~0x01));
  assert(!(rl   & ~0x01));
  return riscv_bits(op,   0x7f,  0)
       | riscv_bits(rd,   0x1f,  7)
       | riscv_bits(fct,  0x07, 12)
       | riscv_bits(rs1,  0x1f, 15)
       | riscv_bits(rs2,  0x1f, 20)
       | riscv_bits(rl,   0x01, 25)
       | riscv_bits(aq,   0x01, 26)
       | riscv_bits(fct5, 0x1f, 27);
}

/*
 * I-type: imm[31:20] rs1[19:15] funct3[14:12] rd[11:7] opcode[6:0].
 * `imm` must satisfy simm12_p; only its low 12 bits are stored.
 */
static uint32_t
Itype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t imm)
{
  assert(!(op  & ~0x7f));
  assert(!(rd  & ~0x1f));
  assert(!(fct & ~0x07));
  assert(!(rs1 & ~0x1f));
  assert(simm12_p(imm));
  return riscv_bits(op,  0x7f,   0)
       | riscv_bits(rd,  0x1f,   7)
       | riscv_bits(fct, 0x07,  12)
       | riscv_bits(rs1, 0x1f,  15)
       | riscv_bits(imm, 0xfff, 20);
}

#if __WORDSIZE == 64
/*
 * 64-bit shift-immediate form: imm[31:26] shamt[25:20] rs1[19:15]
 * funct3[14:12] rd[11:7] opcode[6:0].  `sh` is a 6-bit shift amount and
 * `imm` the 6-bit selector stored above it.
 */
static uint32_t
IStype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t sh,
       int32_t imm)
{
  assert(!(op  & ~0x7f));
  assert(!(rd  & ~0x1f));
  assert(!(fct & ~0x07));
  assert(!(rs1 & ~0x1f));
  assert(!(sh  & ~0x3f));
  assert(simm6_p(imm));
  return riscv_bits(op,  0x7f,  0)
       | riscv_bits(rd,  0x1f,  7)
       | riscv_bits(fct, 0x07, 12)
       | riscv_bits(rs1, 0x1f, 15)
       | riscv_bits(sh,  0x3f, 20)
       | riscv_bits(imm, 0x3f, 26);
}
#endif

/*
 * S-type: imm[11:5] at [31:25], rs2[24:20], rs1[19:15], funct3[14:12],
 * imm[4:0] at [11:7], opcode[6:0].  `imm` must satisfy simm12_p.
 */
static uint32_t
Stype(int32_t op, int32_t fct, int32_t rs1, int32_t rs2, int32_t imm)
{
  /* Split the immediate on an unsigned copy so the shifts are well defined
     for negative offsets. */
  const uint32_t uimm = (uint32_t)imm;

  assert(!(op  & ~0x7f));
  assert(!(fct & ~0x07));
  assert(!(rs1 & ~0x1f));
  assert(!(rs2 & ~0x1f));
  assert(simm12_p(imm));
  return riscv_bits(op,        0x7f,  0)
       | riscv_bits(uimm,      0x1f,  7)
       | riscv_bits(fct,       0x07, 12)
       | riscv_bits(rs1,       0x1f, 15)
       | riscv_bits(rs2,       0x1f, 20)
       | riscv_bits(uimm >> 5, 0x7f, 25);
}

/*
 * B-type (conditional branch): imm[12] at [31], imm[10:5] at [30:25],
 * rs2[24:20], rs1[19:15], funct3[14:12], imm[4:1] at [11:8], imm[11] at [7],
 * opcode[6:0].  The offset is a 13-bit signed value whose bit 0 is implied
 * zero, so the encodable range is -4096 .. 4094 in steps of 2.
 */
static uint32_t
Btype(int32_t op, int32_t fct, int32_t rs1, int32_t rs2, int32_t imm)
{
  const uint32_t uimm = (uint32_t)imm;

  assert(!(op  & ~0x7f));
  assert(!(fct & ~0x07));
  assert(!(rs1 & ~0x1f));
  assert(!(rs2 & ~0x1f));
  /* Accept the whole range the format can hold (imm[12] is encoded below). */
  assert(!(imm & 1) && imm <= 4094 && imm >= -4096);
  return riscv_bits(op,         0x7f,  0)
       | riscv_bits(uimm >> 11, 0x01,  7)
       | riscv_bits(uimm >> 1,  0x0f,  8)
       | riscv_bits(fct,        0x07, 12)
       | riscv_bits(rs1,        0x1f, 15)
       | riscv_bits(rs2,        0x1f, 20)
       | riscv_bits(uimm >> 5,  0x3f, 25)
       | riscv_bits(uimm >> 12, 0x01, 31);
}

/*
 * U-type: imm[31:12] rd[11:7] opcode[6:0].  `imm` is the 20-bit value to be
 * placed in the upper bits and must satisfy simm20_p.
 */
static uint32_t
Utype(int32_t op, int32_t rd, int32_t imm)
{
  assert(!(op & ~0x7f));
  assert(!(rd & ~0x1f));
  assert(simm20_p(imm));
  return riscv_bits(op,  0x7f,     0)
       | riscv_bits(rd,  0x1f,     7)
       | riscv_bits(imm, 0xfffff, 12);
}

/*
 * J-type (jal): imm[20] at [31], imm[10:1] at [30:21], imm[11] at [20],
 * imm[19:12] at [19:12], rd[11:7], opcode[6:0].  The offset is a 21-bit
 * signed value with bit 0 implied zero.
 */
static uint32_t
Jtype(int32_t op, int32_t rd, int32_t imm)
{
  const uint32_t uimm = (uint32_t)imm;

  assert(!(op & ~0x7f));
  assert(!(rd & ~0x1f));
  assert(!(imm & 1) && imm <= 1048575 && imm >= -1048576);
  return riscv_bits(op,         0x7f,   0)
       | riscv_bits(rd,         0x1f,   7)
       | riscv_bits(uimm >> 12, 0xff,  12)
       | riscv_bits(uimm >> 11, 0x01,  20)
       | riscv_bits(uimm >> 1,  0x3ff, 21)
       | riscv_bits(uimm >> 20, 0x01,  31);
}
/*
 * RV64I Base Instruction Set (in addition to RV32I)
 */
#define _LWU(rd, rs1, imm)		Itype(3, rd, 6, rs1, imm)
#define _LD(rd, rs1, imm)		Itype(3, rd, 3, rs1, imm)
#define _SD(rs1, rs2, imm)		Stype(35, 3, rs1, rs2, imm)
#if __WORDSIZE == 64
#  define _SLLI(rd, rs1, sh)		IStype(19, rd, 1, rs1, sh, 0)
#  define _SRLI(rd, rs1, sh)		IStype(19, rd, 5, rs1, sh, 0)
#  define _SRAI(rd, rs1, sh)		IStype(19, rd, 5, rs1, sh, 16)
#endif
#define _ADDIW(rd, rs1, imm)		Itype(27, rd, 0, rs1, imm)
/* Word shifts by immediate: the 5-bit shift amount sits in the rs2 field.
   Right shifts use funct3 = 5, the same value as _SRLI/_SRAI and
   _SRLW/_SRAW; funct7 = 32 selects the arithmetic variant. */
#define _SLLIW(rd, rs1, sh)		Rtype(27, rd, 1, rs1, sh, 0)
#define _SRLIW(rd, rs1, sh)		Rtype(27, rd, 5, rs1, sh, 0)
#define _SRAIW(rd, rs1, sh)		Rtype(27, rd, 5, rs1, sh, 32)
#define _ADDW(rd, rs1, rs2)		Rtype(59, rd, 0, rs1, rs2, 0)
#define _SUBW(rd, rs1, rs2)		Rtype(59, rd, 0, rs1, rs2, 32)
#define _SLLW(rd, rs1, rs2)		Rtype(59, rd, 1, rs1, rs2, 0)
#define _SRLW(rd, rs1, rs2)		Rtype(59, rd, 5, rs1, rs2, 0)
#define _SRAW(rd, rs1, rs2)		Rtype(59, rd, 5, rs1, rs2, 32)
/*
 * RV32M Standard Extension
 */
#define _MUL(rd, rs1, rs2)		Rtype(51, rd, 0, rs1, rs2, 1)
#define _MULH(rd, rs1, rs2)		Rtype(51, rd, 1, rs1, rs2, 1)
#define _MULHSU(rd, rs1, rs2)		Rtype(51, rd, 2, rs1, rs2, 1)
#define _MULHU(rd, rs1, rs2)		Rtype(51, rd, 3, rs1, rs2, 1)
#define _DIV(rd, rs1, rs2)		Rtype(51, rd, 4, rs1, rs2, 1)
#define _DIVU(rd, rs1, rs2)		Rtype(51, rd, 5, rs1, rs2, 1)
#define _REM(rd, rs1, rs2)		Rtype(51, rd, 6, rs1, rs2, 1)
#define _REMU(rd, rs1, rs2)		Rtype(51, rd, 7, rs1, rs2, 1)
/*
 * RV64M Standard Extension (in addition to RV32M)
 */
#define _MULW(rd, rs1, rs2)		Rtype(59, rd, 0, rs1, rs2, 1)
#define _DIVW(rd, rs1, rs2)		Rtype(59, rd, 4, rs1, rs2, 1)
#define _DIVUW(rd, rs1, rs2)		Rtype(59, rd, 5, rs1, rs2, 1)
#define _REMW(rd, rs1, rs2)		Rtype(59, rd, 6, rs1, rs2, 1)
#define _REMUW(rd, rs1, rs2)		Rtype(59, rd, 7, rs1, rs2, 1)
/*
 * RV32A Standard Extension
 *
 * These macros take the ordering bits as (rl, aq), while R4type() takes
 * them as (aq, rl); each expansion therefore passes `aq, rl` so that the
 * macro argument named `aq` really lands in the aq bit and `rl` in the rl
 * bit.
 */
#define _LR_W(rd, rs1, rl, aq)			R4type(47, rd, 2, rs1, 0, aq, rl, 2)
#define _SC_W(rd, rs1, rs2, rl, aq)		R4type(47, rd, 2, rs1, rs2, aq, rl, 3)
#define _AMOSWAP_W(rd, rs1, rs2, rl, aq)	R4type(47, rd, 2, rs1, rs2, aq, rl, 1)
#define _AMOADD_W(rd, rs1, rs2, rl, aq)		R4type(47, rd, 2, rs1, rs2, aq, rl, 0)
#define _AMOXOR_W(rd, rs1, rs2, rl, aq)		R4type(47, rd, 2, rs1, rs2, aq, rl, 4)
#define _AMOAND_W(rd, rs1, rs2, rl, aq)		R4type(47, rd, 2, rs1, rs2, aq, rl, 12)
#define _AMOOR_W(rd, rs1, rs2, rl, aq)		R4type(47, rd, 2, rs1, rs2, aq, rl, 8)
#define _AMOMIN_W(rd, rs1, rs2, rl, aq)		R4type(47, rd, 2, rs1, rs2, aq, rl, 16)
#define _AMOMAX_W(rd, rs1, rs2, rl, aq)		R4type(47, rd, 2, rs1, rs2, aq, rl, 20)
#define _AMOMINU_W(rd, rs1, rs2, rl, aq)	R4type(47, rd, 2, rs1, rs2, aq, rl, 24)
#define _AMOMAXU_W(rd, rs1, rs2, rl, aq)	R4type(47, rd, 2, rs1, rs2, aq, rl, 28)
/*
 * RV64A Standard Extension (in addition to RV32A)
 * Same (rl, aq) -> (aq, rl) forwarding as the RV32A macros.
 */
#define _LR_D(rd, rs1, rl, aq)			R4type(47, rd, 3, rs1, 0, aq, rl, 2)
#define _SC_D(rd, rs1, rs2, rl, aq)		R4type(47, rd, 3, rs1, rs2, aq, rl, 3)
#define _AMOSWAP_D(rd, rs1, rs2, rl, aq)	R4type(47, rd, 3, rs1, rs2, aq, rl, 1)
#define _AMOADD_D(rd, rs1, rs2, rl, aq)		R4type(47, rd, 3, rs1, rs2, aq, rl, 0)
#define _AMOXOR_D(rd, rs1, rs2, rl, aq)		R4type(47, rd, 3, rs1, rs2, aq, rl, 4)
#define _AMOAND_D(rd, rs1, rs2, rl, aq)		R4type(47, rd, 3, rs1, rs2, aq, rl, 12)
#define _AMOOR_D(rd, rs1, rs2, rl, aq)		R4type(47, rd, 3, rs1, rs2, aq, rl, 8)
#define _AMOMIN_D(rd, rs1, rs2, rl, aq)		R4type(47, rd, 3, rs1, rs2, aq, rl, 16)
#define _AMOMAX_D(rd, rs1, rs2, rl, aq)		R4type(47, rd, 3, rs1, rs2, aq, rl, 20)
#define _AMOMINU_D(rd, rs1, rs2, rl, aq)	R4type(47, rd, 3, rs1, rs2, aq, rl, 24)
#define _AMOMAXU_D(rd, rs1, rs2, rl, aq)	R4type(47, rd, 3, rs1, rs2, aq, rl, 28)
/*
 * Pseudo Instructions
 */
#define _NOP()				_ADDI((jit_gpr_regno(_ZERO)),\
					      (jit_gpr_regno(_ZERO)), 0)
#define _MV(r0, r1)			_ADDI(r0, r1, 0)
#define _NOT(r0, r1)			_XORI(r0, r1, -1)
#define _NEG(r0, r1)			_SUB(r0, (jit_gpr_regno(_ZERO)), r1)
#define _NEGW(r0, r1)			_SUBW(r0, (jit_gpr_regno(_ZERO)), r1)
#define _SEXT_W(r0, r1)			_ADDIW(r0, r1, 0)
#define _RET()				_JALR((jit_gpr_regno(_ZERO)),\
					      (jit_gpr_regno(_RA)), 0)



// Shorthand: emit one 32-bit instruction word through emit_u32_with_pool().
#define em_wp(jit, inst) emit_u32_with_pool(jit, inst)
*_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void addxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void addxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); + +static void subr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void subcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void subxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); + +static void muli(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void mulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); + +static void divr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void divi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void divr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void divi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); + +static void remi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void remr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void remi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void remr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); + +static void andr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void andi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void orr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void xorr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void xori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void lshi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void lshr(jit_state_t *_jit, 
int32_t r0, int32_t r1, int32_t r2); +static void rshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void rshi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0); +static void rshr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void rshi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0); + + +// Four operand ALU operations +static void qmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qmulr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qmuli(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); +static void qmuli_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); + +static void qdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qdivr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qdivi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); +static void qdivi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); + + +// Unary ALU operations +static void negr(jit_state_t *_jit, int32_t r0, int32_t r1); +static void comr(jit_state_t *_jit, int32_t r0, int32_t r1); + + +// Transfer operations +static void movr(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movi(jit_state_t *_jit, int32_t r0, jit_word_t i0); + +static uint64_t patch_load_from_pool(uint64_t instrs, uint32_t off); +static jit_reloc_t emit_load_from_pool(jit_state_t *_jit, uint64_t insts); +static jit_reloc_t mov_addr(jit_state_t *_jit, int32_t r0); +static jit_reloc_t movi_from_pool(jit_state_t *_jit, int32_t r0); + +static void extr_c(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_uc(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_s(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_us(jit_state_t *_jit, int32_t r0, int32_t r1); + +# if __WORDSIZE == 64 +static void extr_i(jit_state_t *_jit, 
int32_t r0, int32_t r1); +static void extr_ui(jit_state_t *_jit, int32_t r0, int32_t r1); +#endif + + +// Branch instructions +static uint32_t patch_cc_jump(uint32_t inst, int32_t offset); +static jit_reloc_t emit_cc_jump(jit_state_t *_jit, uint32_t inst); + +static jit_reloc_t bltr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blti(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bltr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blti_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bler(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blei(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bler_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blei_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t beqr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t beqi(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bger(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgei(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bger_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgei_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bgtr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgti(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bgtr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgti_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bner(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bnei(jit_state_t *_jit, int32_t r0, jit_word_t i1); + +static jit_reloc_t bmsr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bmsi(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bmcr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bmci(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t 
// Store operations
/* Two-register store forms, one per operand width suffix. */
static void str_c(jit_state_t *_jit, int32_t r0, int32_t r1);
static void str_uc(jit_state_t *_jit, int32_t r0, int32_t r1);
static void str_s(jit_state_t *_jit, int32_t r0, int32_t r1);
static void str_i(jit_state_t *_jit, int32_t r0, int32_t r1);
#if __WORDSIZE == 64
/* 64-bit-only variant.  This slot used to repeat the str_i prototype; the
   sti_/stxr_/stxi_/ldr_ groups all declare their `_l` form here. */
static void str_l(jit_state_t *_jit, int32_t r0, int32_t r1);
#endif
stxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +# if __WORDSIZE == 64 +static void stxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +#endif + +static void stxi_c(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +static void stxi_s(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +static void stxi_i(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +# if __WORDSIZE == 64 +static void stxi_l(jit_state_t *_jit,jit_word_t i0,int32_t r0,int32_t r1); +# endif + + +// Load operations +static void ldr_c(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_uc(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_s(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_us(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_i(jit_state_t *_jit, int32_t r0, int32_t r1); +# if __WORDSIZE == 64 +static void ldr_ui(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_l(jit_state_t *_jit, int32_t r0, int32_t r1); +# endif + +static void ldi_c(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_uc(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_s(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_us(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_i(jit_state_t *_jit, int32_t r0, jit_word_t i0); +# if __WORDSIZE == 64 +static void ldi_ui(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_l(jit_state_t *_jit, int32_t r0, jit_word_t i0); +# endif + +static void ldxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_uc(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_us(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +# if __WORDSIZE == 64 +static void ldxr_ui(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); 
+static void ldxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +#endif + +static void ldxi_c(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_uc(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_us(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_s(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_i(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +# if __WORDSIZE == 64 +static void ldxi_ui(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_l(jit_state_t *_jit,int32_t r0,int32_t r1,jit_word_t i0); +#endif + + +// Argument management +//static void pushr(jit_state_t *_jit, int32_t r0); +//static void popr(jit_state_t *_jit, int32_t r0); +static void ret(jit_state_t *_jit); +static void retr(jit_state_t *_jit, int32_t r0); +static void reti(jit_state_t *_jit, jit_word_t i0); +static void retval_c(jit_state_t *_jit, int32_t r0); +static void retval_uc(jit_state_t *_jit, int32_t r0); +static void retval_s(jit_state_t *_jit, int32_t r0); +static void retval_us(jit_state_t *_jit, int32_t r0); +static void retval_i(jit_state_t *_jit, int32_t r0); +# if __WORDSIZE == 64 +static void retval_ui(jit_state_t *_jit, int32_t r0); +static void retval_l(jit_state_t *_jit, int32_t r0); +#endif + +// Jump and return +static uint32_t patch_jump(uint32_t inst, int32_t offset); +static jit_reloc_t emit_jump(jit_state_t *_jit, uint32_t inst); + +static void callr(jit_state_t *_jit, int32_t r0); +static void calli(jit_state_t *_jit, jit_word_t i0); +static void jmpi_with_link(jit_state_t *_jit, jit_word_t i0); +static void pop_link_register(jit_state_t *_jit); +static void push_link_register(jit_state_t *_jit); +static void jmpr(jit_state_t *_jit, int32_t r0); +static void jmpi(jit_state_t *_jit, jit_word_t i0); +static jit_reloc_t jmp(jit_state_t *_jit); + + +// Atomic operations +static void ldr_atomic(jit_state_t *_jit, int32_t 
dst, int32_t loc); +static void str_atomic(jit_state_t *_jit, int32_t loc, int32_t val); +static void swap_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, + int32_t val); +static void cas_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, + int32_t expected, int32_t desired); + +// Byte swapping operations +static void bswapr_us(jit_state_t *_jit, int32_t r0, int32_t r1); +static void bswapr_ui(jit_state_t *_jit, int32_t r0, int32_t r1); +# if __WORDSIZE == 64 +static void +bswapr_ul(jit_state_t *_jit, int32_t r0, int32_t r1); +#endif + +// Others +static void nop(jit_state_t *_jit, int32_t im); +static void mfence(jit_state_t *_jit); +static void breakpoint(jit_state_t *_jit); + + + +/* + * Binary ALU operations + */ +static void +addr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _ADD(r0, r1, r2)); +} +static void +addi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _ADDI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + addr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +addcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + // TODO: Not sure if this is correct + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), jit_gpr_regno(t0), r1)); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addr(_jit, r0, r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r0, r1)); + } +} + +static void +addci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), jit_gpr_regno(t0), r1)); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addi(_jit, r0, r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r0, 
r1)); + } +} + +static void +addxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + addcr(_jit, r0, r1, r2); + addcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +addxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + addci(_jit, r0, r1, i0); + addcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +subr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SUB(r0, r1, r2)); +} + +static void +subi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + addi(_jit, r0, r1, -i0); +} + +static void +subcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + subr(_jit, jit_gpr_regno(t0), r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addr(_jit, r0, r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, r0)); + } +} + +static void +subci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + subi(_jit, jit_gpr_regno(t0), r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addi(_jit, r0, r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, r0)); + } +} + +static void +subxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + subcr(_jit, r0, r1, r2); + subcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +subxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + 
jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + subci(_jit, r0, r1, i0); + subcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +muli(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + mulr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +mulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _MUL(r0, r1, r2)); +} + +static void +divr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _DIV(r0, r1, r2)); +} + +static void +divi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + divr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +divr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _DIVU(r0, r1, r2)); +} + +static void +divi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + divr_u(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +remi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + remr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +remr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _REM(r0, r1, r2)); +} +static void +remi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + remr_u(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +remr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _REMU(r0, r1, r2)); +} + +static void +andr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _AND(r0, 
r1, r2)); +} + +static void +andi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _ANDI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + em_wp(_jit, _AND(r0, r1, jit_gpr_regno(t0))); + unget_temp_gpr(_jit); + } +} + +static void +orr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _OR(r0, r1, r2)); +} + +static void +ori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _ORI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + orr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +xorr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _XOR(r0, r1, r2)); +} + +static void +xori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _XORI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + xorr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +lshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SLL(r0, r1, r2)); +} + +static void +lshi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _SLLI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + lshr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +rshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SRA(r0, r1, r2)); +} + +static void +rshi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _SRAI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + rshr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +rshr_u(jit_state_t *_jit, int32_t r0, 
int32_t r1, int32_t r2) +{ + em_wp(_jit, _SRL(r0, r1, r2)); +} + +static void +rshi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _SRLI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + rshr_u(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + + +/* + * Four operand ALU operations + */ +static void +iqmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3, + jit_bool_t sign){ + if(r0 == r2 || r0 == r3){ + jit_gpr_t t0 = get_temp_gpr(_jit); + em_wp(_jit, _MUL(jit_gpr_regno(t0), r2, r3)); + if(sign) + em_wp(_jit, _MULH(r1, r2, r3)); + else + em_wp(_jit, _MULHU(r1, r2, r3)); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + em_wp(_jit, _MUL(r0, r2, r3)); + if(sign) + em_wp(_jit, _MULH(r1, r2, r3)); + else + em_wp(_jit, _MULHU(r1, r2, r3)); +} + +static void +qmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqmulr(_jit, r0, r1, r2, r3, 1); +} + +static void +qmulr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqmulr(_jit, r0, r1, r2, r3, 0); +} + +static void +qmuli(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqmulr(_jit, r0, r1, r2, jit_gpr_regno(t0), 1); + unget_temp_gpr(_jit); +} + +static void +qmuli_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqmulr(_jit, r0, r1, r2, jit_gpr_regno(t0), 0); + unget_temp_gpr(_jit); +} + +static void +iqdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3, + jit_bool_t sign){ + if(r0 == r2 || r0 == r3){ + jit_gpr_t t0 = get_temp_gpr(_jit); + if(sign){ + em_wp(_jit, _DIV(jit_gpr_regno(t0), r2, r3)); + em_wp(_jit, _REM(r1, r2, r3)); + } else { + em_wp(_jit, _DIVU(jit_gpr_regno(t0), r2, r3)); + em_wp(_jit, 
_REMU(r1, r2, r3)); + } + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + if(sign){ + em_wp(_jit, _DIV(r0, r2, r3)); + em_wp(_jit, _REM(r1, r2, r3)); + } else { + em_wp(_jit, _DIVU(r0, r2, r3)); + em_wp(_jit, _REMU(r1, r2, r3)); + } +} + +static void +qdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqdivr(_jit, r0, r1, r2, r3, 1); +} + +static void +qdivr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqdivr(_jit, r0, r1, r2, r3, 0); +} + +static void +qdivi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqdivr(_jit, r0, r1, r2, jit_gpr_regno(t0), 1); + unget_temp_gpr(_jit); +} + +static void +qdivi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqdivr(_jit, r0, r1, r2, jit_gpr_regno(t0), 0); + unget_temp_gpr(_jit); +} + + +/* + * Unary ALU operations + */ +static void +negr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _NEG(r0, r1)); +} + +static void +comr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _NOT(r0, r1)); +} + + +/* + * Branch instructions + */ +static uint32_t +patch_cc_jump(uint32_t inst, int32_t offset){ + instr_t i; + i.w = inst; + i.B.imm11 = (offset >> 11) & 0x1; + i.B.imm4_1 = (offset >> 1) & 0xf; + i.B.imm10_5 = (offset >> 5) & 0x3f; + i.B.imm12 = (offset >> 12) & 0x1; + return i.w; +} + +static jit_reloc_t +emit_cc_jump(jit_state_t *_jit, uint32_t inst) +{ + while (1) { + uint8_t *pc_base = _jit->pc.uc; // Offset is from current PC + int32_t off = (uint8_t*)jit_address(_jit) - pc_base; + jit_reloc_t ret = + jit_reloc (_jit, JIT_RELOC_JCC_WITH_VENEER, 0, _jit->pc.uc, pc_base, 0); + uint8_t cc_jump_width = 12; + if (add_pending_literal(_jit, ret, cc_jump_width - 1)) { + em_wp(_jit, patch_cc_jump(inst, off)); + return ret; + } + } +} + +static 
jit_reloc_t +bltr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BLT(r0, r1, 0)); +} + +static jit_reloc_t +blti(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bltr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bltr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BLTU(r0, r1, 0)); +} + +static jit_reloc_t +blti_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bltr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bler(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BGE(r1, r0, 0)); +} + +static jit_reloc_t +blei(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bler(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bler_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BGEU(r1, r0, 0)); +} + +static jit_reloc_t +blei_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bler_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +beqr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BEQ(r0, r1, 0)); +} + +static jit_reloc_t +beqi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = beqr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BGE(r0, r1, 0)); +} + +static 
jit_reloc_t +bgei(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bger(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BGEU(r0, r1, 0)); +} + +static jit_reloc_t +bgei_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bger_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bgtr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr(_jit, r1, r0); +} + +static jit_reloc_t +bgti(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bgtr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bgtr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr_u(_jit, r1, r0); +} + +static jit_reloc_t +bgti_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bgtr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bner(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BNE(r0, r1, 0)); +} + +static jit_reloc_t +bnei(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bner(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bmsr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + andr(_jit, jit_gpr_regno(t0), r0, r1); + jit_reloc_t ret = bner(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t 
+bmsi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + andi(_jit, jit_gpr_regno(t0), r0, i0); + jit_reloc_t ret = bner(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bmcr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + andr(_jit, jit_gpr_regno(t0), r0, r1); + jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bmci(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + andi(_jit, jit_gpr_regno(t0), r0, i0); + jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +boaddr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + // NOTE: We need tons of temporaries because RISC-V doesn't provide any + // easy way to solve this. We need to do it in software. + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + jit_gpr_t t2 = get_temp_gpr(_jit); + + addr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0)); + em_wp(_jit, _SLT(jit_gpr_regno(t2), jit_gpr_regno(t0), r0)); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = bner(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +boaddi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = boaddr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +boaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + + addr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTU(jit_gpr_regno(t1), jit_gpr_regno(t0), r0)); + movr(_jit, r0, 
jit_gpr_regno(t0)); + + jit_reloc_t ret = bnei(_jit, jit_gpr_regno(t1), 0); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +boaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = boaddr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxaddr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + jit_gpr_t t2 = get_temp_gpr(_jit); + + addr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0)); + em_wp(_jit, _SLT(jit_gpr_regno(t2), jit_gpr_regno(t0), r0)); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxaddi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bxaddr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + + addr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTU(jit_gpr_regno(t1), jit_gpr_regno(t0), r0)); + movr(_jit, r0, jit_gpr_regno(t0)); + + jit_reloc_t ret = beqi(_jit, jit_gpr_regno(t1), 0); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bxaddr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bosubr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = 
get_temp_gpr(_jit); + jit_gpr_t t2 = get_temp_gpr(_jit); + + subr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0)); + em_wp(_jit, _SLT(jit_gpr_regno(t2), r0, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = bner(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bosubi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bosubr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bosubr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + + subr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTU(jit_gpr_regno(t1), r0, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = beqi(_jit, jit_gpr_regno(t1), 1); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bosubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bosubr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxsubr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + jit_gpr_t t2 = get_temp_gpr(_jit); + + subr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0)); + em_wp(_jit, _SLT(jit_gpr_regno(t2), r0, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxsubi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + 
movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bxsubr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxsubr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + + subr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTU(jit_gpr_regno(t1), r0, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = beqi(_jit, jit_gpr_regno(t1), 0); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxsubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bxsubr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + + +/* + * Transfer operations + */ +static void +movr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + if (r0 != r1) + em_wp(_jit, _MV(r0, r1)); +} + +static void +movi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + int32_t srcreg = jit_gpr_regno(_ZERO); + if (simm32_p(i0)){ + + int64_t hi = ((i0 + 0x800) >> 12) & 0xFFFFF; + int64_t lo = (int32_t)i0<<20>>20; + + if(hi){ + em_wp(_jit, _LUI(r0, hi)); + srcreg = r0; + } + + if(lo || hi == 0){ + em_wp(_jit, _ADDI(r0, srcreg, lo)); + } + + } else { + // 64 bits: load in various steps + // lui, addi, slli, addi, slli, addi, slli, addi + int64_t hh = (i0>>44); + int64_t hl = (i0>>33) - (hh<<11); + int64_t lh = (i0>>22) - ((hh<<22) + (hl<<11)); + int64_t lm = (i0>>11) - ((hh<<33) + (hl<<22) + (lh<<11)); + int64_t ll = i0 - ((hh<<44) + (hl<<33) + (lh<<22) + (lm<<11)); + + + em_wp(_jit, _LUI(r0, hh)); + em_wp(_jit, _SLLI(r0, r0, 32)); + em_wp(_jit, _SRLI(r0, r0, 33)); + em_wp(_jit, _ADDI(r0, r0, hl)); + + em_wp(_jit, _SLLI(r0, r0, 11)); + em_wp(_jit, _ADDI(r0, r0, lh)); + + em_wp(_jit, _SLLI(r0, r0, 11)); + em_wp(_jit, _ADDI(r0, r0, lm)); + + em_wp(_jit, _SLLI(r0, r0, 11)); + em_wp(_jit, _ADDI(r0, r0, ll)); + } +} + +typedef 
union{ + struct{ + instr_t auipc; + instr_t load; // `ld` in RV64 and `lw` in RV32 + } inst; + uint64_t l; +} load_from_pool_t; + +static uint64_t +patch_load_from_pool(uint64_t instrs, int32_t off){ + + load_from_pool_t out, in; + int32_t hi20 = off >>12; + in.l = instrs; + out.inst.auipc.w = _AUIPC(in.inst.auipc.U.rd, hi20); + out.inst.load.w = Itype(in.inst.load.I.opcode, // `ld` or `lw` + in.inst.load.I.rd, + in.inst.load.I.funct3, + in.inst.load.I.rs1, + off - (hi20<<12)); + return out.l; +} + +static jit_reloc_t +emit_load_from_pool(jit_state_t *_jit, uint64_t insts) +{ + while (1) { + uint8_t *pc_base = _jit->pc.uc; // Offset is from current PC + int32_t off = (_jit->pc.uc - pc_base); + jit_reloc_t ret = + jit_reloc (_jit, JIT_RELOC_LOAD_FROM_POOL, 0, _jit->pc.uc, pc_base, 0); + uint8_t load_from_pool_width = 32; + if (add_pending_literal(_jit, ret, load_from_pool_width)) { + emit_u64(_jit, patch_load_from_pool(insts, off)); + return ret; + } + } +} +static jit_reloc_t +movi_from_pool(jit_state_t *_jit, int32_t r0) +{ + load_from_pool_t insts; + insts.inst.auipc.w = _AUIPC(r0, 0); +#if __WORDSIZE == 64 + insts.inst.load.w = _LD(r0, r0, 0); +#elif __WORDSIZE == 32 + insts.inst.load.w = _LW(r0, r0, 0); +#endif + return emit_load_from_pool(_jit, insts.l); +} +static jit_reloc_t +mov_addr(jit_state_t *_jit, int32_t r0) +{ + return movi_from_pool(_jit, r0); +} + + +static void +extr_c(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 8; + lshi(_jit, r0, r1, rot); + rshi(_jit, r0, r0, rot); +} + +static void +extr_uc(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 8; + lshi(_jit, r0, r1, rot); + rshi_u(_jit, r0, r0, rot); +} + +static void +extr_s(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 16; + lshi(_jit, r0, r1, rot); + rshi(_jit, r0, r0, rot); +} + +static void +extr_us(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 16; + lshi(_jit, r0, r1, rot); + rshi_u(_jit, r0, 
r0, rot); +} + +# if __WORDSIZE == 64 +static void +extr_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 32; + lshi(_jit, r0, r1, rot); + rshi(_jit, r0, r0, rot); +} +static void +extr_ui(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 32; + lshi(_jit, r0, r1, rot); + rshi_u(_jit, r0, r0, rot); +} +#endif + +/* + * Store operations + */ +static void +str_c(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SB(r0, r1, 0)); +} +static void +str_uc(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SB(r0, r1, 0)); +} +static void +str_s(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SH(r0, r1, 0)); +} +static void +str_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SW(r0, r1, 0)); +} +#if __WORDSIZE == 64 +static void +str_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SD(r0, r1, 0)); +} +#endif + +static void +sti_c(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_c(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} + +static void +sti_s(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_s(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} + +static void +sti_i(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_i(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} + +#if __WORDSIZE == 64 +static void +sti_l(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_l(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} +#endif + +static void +stxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_c(_jit, jit_gpr_regno(t0), r2); + 
unget_temp_gpr(_jit); +} + +static void +stxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_s(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} + +static void +stxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_i(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} + +# if __WORDSIZE == 64 +static void +stxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_l(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} +#endif + +static void +stxi_c(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _SB(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_c(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} + + +static void +stxi_s(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _SH(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_s(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} + + +static void +stxi_i(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _SW(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_i(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} + +# if __WORDSIZE == 64 +static void +stxi_l(jit_state_t *_jit,jit_word_t i0,int32_t r0,int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _SD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_l(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} +# endif + + +/* + * Load operations + */ +static void +ldr_c(jit_state_t 
*_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LB(r0, r1, 0)); +} + +static void +ldr_uc(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LBU(r0, r1, 0)); +} + +static void +ldr_s(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LH(r0, r1, 0)); +} + +static void +ldr_us(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LHU(r0, r1, 0)); +} + +static void +ldr_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LW(r0, r1, 0)); +} + +# if __WORDSIZE == 64 +static void +ldr_ui(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LWU(r0, r1, 0)); +} + +static void +ldr_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LD(r0, r1, 0)); +} +# endif + + +static void +ldi_c(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_c(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldi_uc(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_uc(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldi_s(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_s(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldi_us(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_us(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + + +static void +ldi_i(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_i(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +# if __WORDSIZE == 64 +static void +ldi_ui(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_ui(_jit, r0, jit_gpr_regno(t0)); + 
unget_temp_gpr(_jit); +} + +static void +ldi_l(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_l(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +#endif + + + + +static void +ldxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_c(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_uc(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_uc(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_s(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_us(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_us(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_i(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +# if __WORDSIZE == 64 +static void +ldxr_ui(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_ui(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_l(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +#endif + + + + +static void +ldxi_c(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LD(r0, r1, i0)); + else { + 
jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_c(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_uc(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_uc(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_us(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_us(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_s(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_s(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_i(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_i(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +# if __WORDSIZE == 64 +static void +ldxi_ui(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_ui(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_l(jit_state_t *_jit,int32_t r0,int32_t r1,jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_l(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +#endif + + +/* + * Argument management + */ + +// static void +// pushr(jit_state_t 
*_jit, int32_t r0) +// { +// #if __WORDSIZE == 64 +// addi(jit_gpr_regno(_SP), -8); +// em_wp(_SD(r0, jit_gpr_regno(_SP), 0)); +// #elif __WORDSIZE == 32 +// addi(jit_gpr_regno(_SP), -4); +// em_wp(_SW(r0, jit_gpr_regno(_SP), 0)); +// #endif +// } +// static void +// popr(jit_state_t *_jit, int32_t r0) +// { +// #if __WORDSIZE == 64 +// em_wp(_jit, _LD(r0, jit_gpr_regno(_SP), 0)); +// addi(jit_gpr_regno(_SP), 8); +// #elif __WORDSIZE == 32 +// em_wp(_jit, _LW(r0, jit_gpr_regno(_SP), 0)); +// addi(jit_gpr_regno(_SP), 4); +// #endif +// } + +static void +ret(jit_state_t *_jit) +{ + em_wp(_jit, _RET()); +} + +static void +retr(jit_state_t *_jit, int32_t r0) +{ + movr(_jit, jit_gpr_regno(_A0), r0); + ret(_jit); +} + +static void +reti(jit_state_t *_jit, jit_word_t i0) +{ + movi(_jit, jit_gpr_regno(_A0), i0); + ret(_jit); +} + +static void +retval_c(jit_state_t *_jit, int32_t r0) +{ + extr_c(_jit, r0, jit_gpr_regno(_A0)); +} + +static void +retval_uc(jit_state_t *_jit, int32_t r0) +{ + extr_uc(_jit, r0, jit_gpr_regno(_A0)); +} + +static void +retval_s(jit_state_t *_jit, int32_t r0) +{ + extr_s(_jit, r0, jit_gpr_regno(_A0)); +} + +static void +retval_us(jit_state_t *_jit, int32_t r0) +{ + extr_us(_jit, r0, jit_gpr_regno(_A0)); +} + +static void +retval_i(jit_state_t *_jit, int32_t r0) +{ + extr_i(_jit, r0, jit_gpr_regno(_A0)); +} + +# if __WORDSIZE == 64 +static void +retval_ui(jit_state_t *_jit, int32_t r0) +{ + extr_ui(_jit, r0, jit_gpr_regno(_A0)); +} + +static void +retval_l(jit_state_t *_jit, int32_t r0) +{ + movr(_jit, r0, jit_gpr_regno(_A0)); +} +#endif + +/* + * Jump and return instructions + */ +static uint32_t +patch_jump(uint32_t inst, int32_t offset) +{ + instr_t i; + i.w = inst; + i.J.imm20 = (offset >> 20) & 0x1; + i.J.imm19_12= (offset >> 12) & 0xff; + i.J.imm11 = (offset >> 11) & 0x1; + i.J.imm10_1 = (offset >> 1) & 0x3ff; + return i.w; +} +static jit_reloc_t +emit_jump(jit_state_t *_jit, uint32_t inst) +{ + while (1) { + uint8_t *pc_base = _jit->pc.uc; 
// Offset is from current PC + int32_t off = (uint8_t*)jit_address(_jit) - pc_base; + jit_reloc_t ret = + jit_reloc (_jit, JIT_RELOC_JMP_WITH_VENEER, 0, _jit->pc.uc, pc_base, 0); + uint8_t jump_width = 20; + if (add_pending_literal(_jit, ret, jump_width - 1)) { + em_wp(_jit, patch_jump(inst, off)); + return ret; + } + } +} + +static void +callr(jit_state_t *_jit, int32_t r0) +{ + em_wp(_jit, _JALR(jit_gpr_regno(_RA), r0, 0)); +} + +static void +calli(jit_state_t *_jit, jit_word_t i0) +{ + jit_word_t jumpoffset = i0 - (jit_word_t)(_jit->pc.uc); + if (simm20_p(jumpoffset)){ + em_wp(_jit, _JAL(jit_gpr_regno(_RA), jumpoffset)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + callr(_jit, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +jmpi_with_link(jit_state_t *_jit, jit_word_t i0) +{ + calli(_jit, i0); +} + +static void +pop_link_register(jit_state_t *_jit) +{ +} + +static void +push_link_register(jit_state_t *_jit) +{ +} + +static void +jmpr(jit_state_t *_jit, int32_t r0) +{ + em_wp(_jit, _JALR(jit_gpr_regno(_ZERO), r0, 0)); +} + +static void +jmpi(jit_state_t *_jit, jit_word_t i0) +{ + jit_word_t jumpoffset = i0 - (jit_word_t)(_jit->pc.uc); + if (simm20_p(jumpoffset)){ + em_wp(_jit, _JAL(jit_gpr_regno(_ZERO), jumpoffset)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jmpr(_jit, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static jit_reloc_t +jmp(jit_state_t *_jit) +{ + return emit_jump(_jit, _JAL(jit_gpr_regno(_ZERO), 0)); +} + + + +/* + * Atomic operations + */ + +static void +ldr_atomic(jit_state_t *_jit, int32_t dst, int32_t loc) +{ + em_wp(_jit, _FENCE(0xFF)); + ldr_i(_jit, dst, loc); + em_wp(_jit, _FENCE(0xFF)); +} + +static void +str_atomic(jit_state_t *_jit, int32_t loc, int32_t val) +{ + em_wp(_jit, _FENCE(0xFF)); + str_i(_jit, loc, val); + em_wp(_jit, _FENCE(0xFF)); +} + +static void +swap_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, int32_t 
val) +{ +#if __WORDSIZE == 64 + em_wp(_jit, _AMOSWAP_D(dst, loc, val, 1, 1)); +#elif __WORDSIZE == 32 + em_wp(_jit, _AMOSWAP_W(dst, loc, val, 1, 1)); +#endif +} + +static void +cas_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, int32_t expected, + int32_t desired) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + void *retry = jit_address(_jit); + +#if __WORDSIZE == 64 + em_wp(_jit, _LR_D(t0, loc, 0,0)); +#elif __WORDSIZE == 32 + em_wp(_jit, _LR_W(t0, loc, 0,0)); +#endif + + jit_reloc_t fail = bner(_jit, t0, expected); + +#if __WORDSIZE == 64 + em_wp(_jit, _SC_D(t1, desired, loc, 0,0)); +#elif __WORDSIZE == 32 + em_wp(_jit, _SC_W(t1, desired, loc, 0,0)); +#endif + + jit_patch_there(_jit, bner(_jit, t1, jit_gpr_regno(_ZERO)), retry); + + jit_patch_here(_jit, fail); + em_wp(_jit, _FENCE(0xFF)); + movr(_jit, dst, t0); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); +} + + +/* + * Byte swapping operations + * RISC-V Doesn't provide them by default. + * There's a B extension (Standard Extension for Bit Manipulation) draft, but + * it's not official yet: + * https://github.com/riscv/riscv-bitmanip + * Meanwhile, we need to implement them in software. 
+ */ +static void +bswapr_uany(jit_state_t *_jit, int32_t r0, int32_t r1, size_t size) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + andi(_jit, r0, r1, 0xFF); + for(int i = 1; i < size; i++){ + lshi(_jit, r0, r0, 8); + rshi(_jit, t0, r1, 8*i); + andi(_jit, t0, t0, 0xFF); + orr(_jit, r0, r0, t0); + } + unget_temp_gpr(_jit); +} + +static void +bswapr_us(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + bswapr_uany(_jit, r0, r1, 2); +} + +static void +bswapr_ui(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + bswapr_uany(_jit, r0, r1, 4); +} + +# if __WORDSIZE == 64 +static void +bswapr_ul(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + bswapr_uany(_jit, r0, r1, 8); +} +#endif + + + +/* + * Others + * TODO + */ +static void +nop(jit_state_t *_jit, int32_t im) +{ + for (; im > 0; im -= 4) + em_wp(_jit, _NOP()); + assert(im == 0); +} +static void +mfence(jit_state_t *_jit) +{ + // TODO: we may need it for atomic operations? +} + +static void +breakpoint(jit_state_t *_jit) +{ + em_wp(_jit, _EBREAK()); +} diff --git a/lightening/riscv-fpu.c b/lightening/riscv-fpu.c new file mode 100644 index 000000000..315ed8d14 --- /dev/null +++ b/lightening/riscv-fpu.c @@ -0,0 +1,858 @@ +/* + * RV32F Standard Extension + */ +#define _FLW(rd, rs1, im) Itype(7, rd, 2, rs1, im) +#define _FSW(rs1, rs2, imm) Stype(39, 2, rs1, rs2, imm) +#define _FMADD_S(rd, rs1, rs2, rs3) R4type(67, rd, 0, rs1, rs2, 0, rs3) +#define _FMSUB_S(rd, rs1, rs2, rs3) R4type(71, rd, 0, rs1, rs2, 0, rs3) +#define _FNMSUB_S(rd, rs1, rs2, rs3) R4type(75, rd, 0, rs1, rs2, 0, rs3) +#define _FNMADD_S(rd, rs1, rs2, rs3) R4type(79, rd, 0, rs1, rs2, 0, rs3) +#define _FADD_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 0) +#define _FSUB_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 4) +#define _FMUL_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 8) +#define _FDIV_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 12) +#define _FSQRT_S(rd, rs1) Rtype(83, rd, 0, rs1, 0, 44) +#define _FSGNJ_S(rd, rs1, rs2) 
Rtype(83, rd, 0, rs1, rs2, 16) +#define _FSGNJN_S(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 16) +#define _FSGNJX_S(rd, rs1, rs2) Rtype(83, rd, 2, rs1, rs2, 16) +#define _FMIN_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 20) +#define _FMAX_S(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 20) +#define _FCVT_W_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 96) +#define _FCVT_WU_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 96) +#define _FMV_X_W(rd, rs1) Rtype(83, rd, 0, rs1, 0, 112) +#define _FEQ_S(rd, rs1, rs2) Rtype(83, rd, 2, rs1, rs2, 80) +#define _FLT_S(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 80) +#define _FLE_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 80) +#define _FCLASS_S(rd, rs1) Rtype(83, rd, 1, rs1, 0, 112) +#define _FCVT_S_W(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 104) +#define _FCVT_S_WU(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 104) +#define _FMV_W_X(rd, rs1) Rtype(83, rd, 0, rs1, 0, 120) +/* + * RV64F Standard Extension (in addition to RV32F) + */ +#define _FCVT_L_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 2, 96) +#define _FCVT_LU_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 3, 96) +#define _FCVT_S_L(rd, rs1, rm) Rtype(83, rd, rm, rs1, 2, 104) +#define _FCVT_S_LU(rd, rs1, rm) Rtype(83, rd, rm, rs1, 3, 104) +/* + * RV32D Standard Extension + */ +#define _FLD(rd, rs1, im) Itype(7, rd, 3, rs1, im) +#define _FSD(rs1, rs2, imm) Stype(39, 3, rs1, rs2, imm) +#define _FMADD_D(rd, rs1, rs2, rs3) R4type(67, rd, 0, rs1, rs2, 1, rs3) +#define _FMSUB_D(rd, rs1, rs2, rs3) R4type(71, rd, 0, rs1, rs2, 1, rs3) +#define _FNMSUB_D(rd, rs1, rs2, rs3) R4type(75, rd, 0, rs1, rs2, 1, rs3) +#define _FNMADD_D(rd, rs1, rs2, rs3) R4type(79, rd, 0, rs1, rs2, 1, rs3) +#define _FADD_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 1) +#define _FSUB_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 5) +#define _FMUL_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 9) +#define _FDIV_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 13) +#define _FSQRT_D(rd, rs1) Rtype(83, rd, 0, rs1, 0, 45) +#define _FSGNJ_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, 
rs2, 17) +#define _FSGNJN_D(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 17) +#define _FSGNJX_D(rd, rs1, rs2) Rtype(83, rd, 2, rs1, rs2, 17) +#define _FMIN_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 21) +#define _FMAX_D(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 21) +#define _FCVT_S_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 32) +#define _FCVT_D_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 33) +#define _FEQ_D(rd, rs1, rs2) Rtype(83, rd, 2, rs1, rs2, 81) +#define _FLT_D(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 81) +#define _FLE_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 81) +#define _FCLASS_D(rd, rs1) Rtype(83, rd, 1, rs1, 0, 113) +#define _FCVT_W_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 97) +#define _FCVT_WU_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 97) +#define _FCVT_D_W(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 105) +#define _FCVT_D_WU(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 105) +/* + * RV64D Standard Extension (in addition to RV32D) + */ +#define _FCVT_L_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 2, 97) +#define _FCVT_LU_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 3, 97) +#define _FMV_X_D(rd, rs1) Rtype(83, rd, 0, rs1, 0, 113) +#define _FCVT_D_L(rd, rs1, rm) Rtype(83, rd, rm, rs1, 2, 105) +#define _FCVT_D_LU(rd, rs1, rm) Rtype(83, rd, rm, rs1, 3, 105) +#define _FMV_D_X(rd, rs1) Rtype(83, rd, 0, rs1, 0, 121) +/* + * Pseudo instructions + */ +#define _FMV_S(r0, r1) _FSGNJ_S(r0, r1, r1) +#define _FABS_S(r0, r1) _FSGNJX_S(r0, r1, r1) +#define _FNEG_S(r0, r1) _FSGNJN_S(r0, r1, r1) +#define _FMV_D(r0, r1) _FSGNJ_D(r0, r1, r1) +#define _FABS_D(r0, r1) _FSGNJX_D(r0, r1, r1) +#define _FNEG_D(r0, r1) _FSGNJN_D(r0, r1, r1) + +// Binary ALU operations +static void addr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void addr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void mulr_f(jit_state_t *_jit, int32_t r0, 
int32_t r1, int32_t r2); +static void mulr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void divr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void divr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); + +// Unary ALU operations +static void sqrtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void sqrtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void negr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void negr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void absr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void absr_d(jit_state_t *_jit, int32_t r0, int32_t r1); + +// Transfer operations +static void movr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movr_d(jit_state_t *_jit, int32_t r0, int32_t r1); + +// Argument management +static void retr_f(jit_state_t *_jit, int32_t u); +static void retr_d(jit_state_t *_jit, int32_t u); + +// Load operations +static void ldr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldi_f(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxi_f(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldi_d(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxi_d(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); + +// Store operations +static void str_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void str_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void sti_f(jit_state_t *_jit, jit_word_t i0, int32_t r0); +static void stxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxi_f(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +static void sti_d(jit_state_t *_jit, jit_word_t i0, int32_t r0); +static void 
stxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxi_d(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); + +// Branch instructions +static jit_reloc_t bltr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bler_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t beqr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bger_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bner_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunltr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunler_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunger_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bungtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bltgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bordr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunordr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bltr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bler_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t beqr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bger_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bner_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunltr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunler_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunger_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bungtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t 
bltgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bordr_d(jit_state_t *_jit, int32_t r0, int32_t r1); + +/* + * Binary ALU operations + */ +static void +addr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FADD_S(r0, r1, r2)); +} +static void +addr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FADD_D(r0, r1, r2)); +} +static void +subr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FSUB_S(r0, r1, r2)); +} +static void +subr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FSUB_D(r0, r1, r2)); +} +static void +mulr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FMUL_S(r0, r1, r2)); +} +static void +mulr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FMUL_D(r0, r1, r2)); +} +static void +divr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FDIV_S(r0, r1, r2)); +} +static void +divr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FDIV_D(r0, r1, r2)); +} + +/* + * Unary ALU operations + */ +static void +sqrtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSQRT_S(r0, r1)); +} +static void +sqrtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSQRT_D(r0, r1)); +} +static void +negr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FNEG_S(r0, r1)); +} +static void +negr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FNEG_D(r0, r1)); +} +static void +absr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FABS_S(r0, r1)); +} + +static void +absr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FABS_D(r0, r1)); +} + + +/* + * Load operations + */ +static void +ldr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FLW(r0, r1, 0)); +} +static void +ldr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FLD(r0, r1, 0)); +} 
+static void +ldi_f(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_f(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_f(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxi_f(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _FLW(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_f(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldi_d(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_d(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_d(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldxi_d(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _FLD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_d(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + + + +/* + * Store operations + */ +static void +str_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSW(r0, r1, 0)); +} +static void +str_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSD(r0, r1, 0)); +} +static void +sti_f(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_f(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} +static void +stxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = 
get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_f(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} +static void +stxi_f(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _FSW(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_f(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} +static void +sti_d(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_d(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} +static void +stxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_d(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} +static void +stxi_d(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _FSD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_d(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} + + +/* + * Transfer operations + */ +static void +movr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + if (r0 != r1) + em_wp(_jit, _FMV_S(r0, r1)); +} + +static void +movr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + if (r0 != r1) + em_wp(_jit, _FMV_D(r0, r1)); +} +static void +truncr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_W_S(r0, r1, 1)); +} +static void +truncr_d_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_W_D(r0, r1, 1)); +} +static void +truncr_f_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_L_S(r0, r1, 1)); +} +static void +truncr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_L_D(r0, r1, 1)); +} + +static void +extr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ +#if __WORDSIZE == 64 + em_wp(_jit, _FCVT_S_L(r0, r1, 0)); +#elif 
__WORDSIZE == 32 + em_wp(_jit, _FCVT_S_W(r0, r1, 0)); +#endif +} +static void +extr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ +#if __WORDSIZE == 64 + em_wp(_jit, _FCVT_D_L(r0, r1, 0)); +#elif __WORDSIZE == 32 + em_wp(_jit, _FCVT_D_W(r0, r1, 0)); +#endif +} + +static void +extr_f_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_D_S(r0, r1, 0)); +} +static void +extr_d_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_S_D(r0, r1, 0)); +} + +static void +movi_f(jit_state_t *_jit, int32_t r0, jit_float32_t i0) +{ + union { int32_t i; jit_float32_t f; } u = { .f = i0 }; + jit_gpr_t reg = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(reg), u.i); + em_wp(_jit, _FMV_W_X(r0, jit_gpr_regno(reg))); + unget_temp_gpr(_jit); +} +static void +movi_d(jit_state_t *_jit, int32_t r0, jit_float64_t i0) +{ + // TODO: How to move a 64 bit value from a 32 bit X register? + // ATM only works on RV64 + union { int64_t i; jit_float64_t f; } u = { .f = i0 }; + jit_gpr_t reg = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(reg), u.i); + em_wp(_jit, _FMV_D_X(r0, jit_gpr_regno(reg))); + unget_temp_gpr(_jit); +} + + +/* + * Argument management + */ +static void +retval_f(jit_state_t *_jit, int32_t r0) +{ + movr_f(_jit, jit_fpr_regno(_FA0), r0); +} + +static void +retval_d(jit_state_t *_jit, int32_t r0) +{ + movr_d(_jit, jit_fpr_regno(_FA0), r0); +} + +static void +retr_f(jit_state_t *_jit, int32_t u) +{ + movr_f(_jit, jit_fpr_regno(_FA0), u); + ret(_jit); +} + +static void +retr_d(jit_state_t *_jit, int32_t u) +{ + movr_d(_jit, jit_fpr_regno(_FA0), u); + ret(_jit); +} + + +/* + * Branch instructions + */ + +static jit_reloc_t +bltr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_S(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bler_f(jit_state_t *_jit, int32_t r0, 
int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_S(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +beqr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_S(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bler_f(_jit, r1, r0); +} + +static jit_reloc_t +bgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr_f(_jit, r1, r0); +} + +static jit_reloc_t +bner_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_S(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunltr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_S(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunler_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_S(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_S(t0, r0, r1)); + em_wp(_jit, _FLT_S(t1, r1, r0)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static 
jit_reloc_t +bunger_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_S(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bungtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_S(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bltgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_S(t0, r1, r0)); + em_wp(_jit, _FLT_S(t1, r0, r1)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bordr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_S(t0, r0, r0)); + em_wp(_jit, _FEQ_S(t1, r1, r1)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunordr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_S(t0, r1, r1)); + em_wp(_jit, _FEQ_S(t1, r0, r0)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bltr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_D(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + 
+static jit_reloc_t +bler_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_D(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +beqr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_D(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bler_d(_jit, r1, r0); +} + +static jit_reloc_t +bgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr_d(_jit, r1, r0); +} + +static jit_reloc_t +bner_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_D(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunltr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_D(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunler_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_D(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_D(t0, r0, r1)); + em_wp(_jit, _FLT_D(t1, r1, r0)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); 
+ + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunger_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_D(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bungtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_D(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bltgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_D(t0, r1, r0)); + em_wp(_jit, _FLT_D(t1, r0, r1)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bordr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_D(t0, r0, r0)); + em_wp(_jit, _FEQ_D(t1, r1, r1)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunordr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_D(t0, r1, r1)); + em_wp(_jit, _FEQ_D(t1, r0, r0)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} diff --git a/lightening/riscv.c b/lightening/riscv.c new file mode 100644 index 000000000..eaac94a96 --- /dev/null +++ b/lightening/riscv.c @@ -0,0 +1,327 @@ +/* + * Copyright (C) 2021-2024 Free Software Foundation, Inc. 
+ *
+ * This file is part of GNU lightning.
+ *
+ * GNU lightning is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU lightning is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * Authors:
+ *   Ekaitz Zarraga
+ */
+
+#include "riscv-cpu.c"
+#include "riscv-fpu.c"
+
+static const jit_gpr_t abi_gpr_args[] = {
+  _A0, _A1, _A2, _A3, _A4, _A5, _A6, _A7
+};
+static const jit_fpr_t abi_fpr_args[] = {
+  _FA0, _FA1, _FA2, _FA3, _FA4, _FA5, _FA6, _FA7
+};
+static const int abi_gpr_arg_count = sizeof(abi_gpr_args) / sizeof(abi_gpr_args[0]);
+static const int abi_fpr_arg_count = sizeof(abi_fpr_args) / sizeof(abi_fpr_args[0]);
+
+struct abi_arg_iterator
+{
+  const jit_operand_t *args;
+  size_t argc;
+
+  size_t arg_idx;
+  size_t gpr_idx;
+  size_t fpr_idx;
+  uint32_t vfp_used_registers;
+  size_t stack_size;
+  size_t stack_padding;
+};
+
+static size_t page_size;
+
+jit_bool_t
+jit_get_cpu(void)
+{
+  page_size = sysconf(_SC_PAGE_SIZE);
+  // FIXME check version, extensions, hardware fp support
+  //
+  // List of macro definitions for riscv support:
+  // -------------------------------------------
+  // __riscv: defined for any RISC-V target. Older versions of the GCC
+  // toolchain defined __riscv__.
+  //
+  // __riscv_xlen: 32 for RV32 and 64 for RV64.
+  //
+  // __riscv_float_abi_soft, __riscv_float_abi_single,
+  // __riscv_float_abi_double: one of these three will be defined, depending on
+  // target ABI.
+  //
+  // __riscv_cmodel_medlow, __riscv_cmodel_medany: one of these two will be
+  // defined, depending on the target code model.
+  //
+  // __riscv_mul: defined when targeting the 'M' ISA extension.
+  //
+  // __riscv_muldiv: defined when targeting the 'M' ISA extension and -mno-div
+  // has not been used.
+  //
+  // __riscv_div: defined when targeting the 'M' ISA extension and -mno-div has
+  // not been used.
+  //
+  // __riscv_atomic: defined when targeting the 'A' ISA extension.
+  //
+  // __riscv_flen: 32 when targeting the 'F' ISA extension (but not 'D') and 64
+  // when targeting 'FD'.
+  //
+  // __riscv_fdiv: defined when targeting the 'F' or 'D' ISA extensions and
+  // -mno-fdiv has not been used.
+  //
+  // __riscv_fsqrt: defined when targeting the 'F' or 'D' ISA extensions and
+  // -mno-fdiv has not been used.
+  //
+  // __riscv_compressed: defined when targeting the 'C' ISA extension.
+  return 1;
+}
+
+jit_bool_t
+jit_init(jit_state_t *_jit)
+{
+  return 1;
+}
+
+static size_t
+jit_initial_frame_size (void)
+{
+  return 0;
+}
+
+static void
+reset_abi_arg_iterator(struct abi_arg_iterator *iter, size_t argc,
+                       const jit_operand_t *args)
+{
+  memset(iter, 0, sizeof *iter);
+  iter->argc = argc;
+  iter->args = args;
+}
+
+static void
+next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg)
+{
+  ASSERT(iter->arg_idx < iter->argc);
+  enum jit_operand_abi abi = iter->args[iter->arg_idx].abi;
+  iter->arg_idx++;
+  if (is_gpr_arg(abi) && iter->gpr_idx < abi_gpr_arg_count) {
+    *arg = jit_operand_gpr (abi, abi_gpr_args[iter->gpr_idx++]);
+    return;
+  }
+  if (is_fpr_arg(abi) && iter->fpr_idx < abi_fpr_arg_count) {
+    *arg = jit_operand_fpr (abi, abi_fpr_args[iter->fpr_idx++]);
+    return;
+  }
+  *arg = jit_operand_mem (abi, JIT_SP, iter->stack_size); // no register left: next stack slot
+#if __WORDSIZE == 32
+  iter->stack_size += 4;
+#elif __WORDSIZE == 64
+  iter->stack_size += 8;
+#endif
+}
+
+static void
+jit_flush(void *fptr, void *tptr)
+{
+  jit_word_t f = (jit_word_t)fptr & -page_size; // round down to a page boundary
+  jit_word_t t = (((jit_word_t)tptr) + page_size - 1) & -page_size; // round up
+  __clear_cache((void *)f, (void *)t);
+}
+
+static inline size_t
+jit_stack_alignment(void)
+{
+  return 8;
+  // NOTE: See: https://github.com/riscv/riscv-gcc/issues/61
+}
+
+static void
+jit_try_shorten(jit_state_t *_jit, jit_reloc_t reloc, jit_pointer_t addr)
+{
+}
+
+static void*
+bless_function_pointer(void *ptr)
+{
+  return ptr;
+}
+
+
+/*
+ * Veneers: auipc + load + jalr followed immediately by the target address
+ */
+struct __attribute__((packed)) veneer{ // packed: no padding, address sits right after jalr
+  instr_t auipc;
+  instr_t load; // `ld` in RV64 and `lw` in RV32
+  instr_t jalr;
+#if __WORDSIZE == 64
+  uint64_t address;
+#elif __WORDSIZE == 32
+  uint32_t address;
+#endif
+};
+
+static void
+emit_veneer(jit_state_t *_jit, jit_pointer_t target)
+{
+  // We need to generate something like this (RV64):
+  // ----------------------------------------------
+  // auipc t0, 0
+  // ld t0, 12(t0)
+  // jalr zero, 0(t0)
+  // ADDRESS_LITERAL
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  emit_u32(_jit, _AUIPC(jit_gpr_regno(t0), 0));
+#if __WORDSIZE == 64
+  emit_u32(_jit, _LD(jit_gpr_regno(t0), jit_gpr_regno(t0), 12));
+#elif __WORDSIZE == 32
+  emit_u32(_jit, _LW(jit_gpr_regno(t0), jit_gpr_regno(t0), 12));
+#endif
+  emit_u32(_jit, _JALR(jit_gpr_regno(_ZERO), jit_gpr_regno(t0), 0));
+#if __WORDSIZE == 64
+  emit_u64(_jit, (uint64_t) target);
+#elif __WORDSIZE == 32
+  emit_u32(_jit, (uint32_t) target);
+#endif
+  unget_temp_gpr(_jit);
+}
+
+static void
+patch_veneer(uint32_t *loc, jit_pointer_t addr)
+{
+  struct veneer *v = (struct veneer*) loc; // loc: first instruction of an emitted veneer
+#if __WORDSIZE == 64
+  v->address = (uint64_t) addr;
+#elif __WORDSIZE == 32
+  v->address = (uint32_t) addr;
+#endif
+}
+
+
+/*
+ * Conditional jumps: B-type, signed 13-bit byte offset split over four fields
+ */
+static void
+patch_jcc_offset(uint32_t *loc, ptrdiff_t v)
+{
+  // Scatter byte offset v into the B-type immediate fields (bit 0 is implicit).
+  instr_t *i = (instr_t *) loc;
+  i->B.imm11 = (v >> 11) & 0x1;
+  i->B.imm4_1 = (v >> 1) & 0xf;
+  i->B.imm10_5 = (v >> 5) & 0x3f;
+  i->B.imm12 = (v >> 12) & 0x1;
+}
+static void
+patch_veneer_jcc_offset(uint32_t *loc, ptrdiff_t offset){
+  patch_jcc_offset(loc, offset);
+}
+
+static int32_t
+read_jcc_offset(uint32_t *loc)
+{
+  instr_t i;
+  i.w = *loc;
+
+  // imm12 is the sign bit: when set, bits 31..12 are ones (bit 11 comes from imm11).
+  int32_t offset = (i.B.imm12 & 1) ? -0x1000 : 0;
+  offset |= (i.B.imm11 << 11);
+  offset |= (i.B.imm10_5 << 5);
+  offset |= (i.B.imm4_1 << 1);
+
+  return offset;
+}
+static int
+offset_in_jcc_range(ptrdiff_t offset, int flags)
+{
+  if(offset & 1)
+    return 0;
+  else
+    return -0x1000 <= offset && offset <= 0xFFF;
+}
+
+/*
+ * Unconditional jumps: J-type, signed 21-bit byte offset split over four fields
+ */
+static int32_t read_jmp_offset(uint32_t *loc)
+{
+  instr_t i;
+  i.w = *loc;
+
+  // imm20 is the sign bit: when set, bits 31..20 are ones (bit 19 comes from imm19_12).
+  int32_t offset = (i.J.imm20 & 1) ? -0x100000 : 0;
+  offset |= (i.J.imm19_12 << 12);
+  offset |= (i.J.imm11 << 11);
+  offset |= (i.J.imm10_1 << 1);
+  return offset;
+}
+static int
+offset_in_jmp_range(ptrdiff_t offset, int flags)
+{
+  if(offset & 1)
+    return 0;
+  else
+    return -0x100000 <= offset && offset <= 0xFFFFF;
+}
+
+static void
+patch_jmp_offset(uint32_t *loc, ptrdiff_t v)
+{
+  instr_t *i = (instr_t *) loc; // v is a byte offset; bit 0 is implicit
+  i->J.imm20 = (v >> 20) & 0x1;
+  i->J.imm19_12= (v >> 12) & 0xff;
+  i->J.imm11 = (v >> 11) & 0x1;
+  i->J.imm10_1 = (v >> 1) & 0x3ff;
+}
+
+static void
+patch_veneer_jmp_offset(uint32_t *loc, ptrdiff_t offset)
+{
+  patch_jmp_offset(loc, offset);
+}
+
+
+/*
+ * Jumps around the veneer (distance measured in bytes, as patch_jmp_offset encodes it)
+ */
+static void
+patch_jmp_without_veneer(jit_state_t *_jit, uint32_t *loc)
+{
+  patch_jmp_offset(loc, (uint8_t *) _jit->pc.ui - (uint8_t *) loc);
+}
+static uint32_t*
+jmp_without_veneer(jit_state_t *_jit)
+{
+  uint32_t *loc = _jit->pc.ui;
+  emit_u32(_jit, _JAL(jit_gpr_regno(_ZERO), 0));
+  return loc;
+}
+
+
+/*
+ * Load from pool offset: auipc carries the upper 20 bits, the load a signed low 12 bits
+ */
+static void
+patch_load_from_pool_offset(uint32_t *loc, int32_t v)
+{
+  load_from_pool_t *i = (load_from_pool_t *) loc;
+  int32_t hi20 = (v + 0x800) >> 12; // round so the remainder fits in [-2048, 2047]
+  i->inst.auipc.U.imm31_12 = hi20;
+  i->inst.load.I.imm11_0 = v - hi20 * 0x1000;
+}
+static int32_t
+read_load_from_pool_offset(uint32_t *loc)
+{
+  load_from_pool_t *i = (load_from_pool_t*) loc;
+  // Rebuild the offset: upper part shifted into place plus the sign-extended low part.
+  return (int32_t) ((uint32_t) i->inst.auipc.U.imm31_12 << 12)
+    + (((int32_t) (i->inst.load.I.imm11_0 & 0xfff) ^ 0x800) - 0x800);
+}
diff --git a/lightening/riscv.h b/lightening/riscv.h
new file mode 100644
index 000000000..173216655
--- /dev/null
+++ b/lightening/riscv.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (C) 2021-2024 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU lightning.
+ *
+ * GNU lightning is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 3, or (at your option)
+ * any later version.
+ *
+ * GNU lightning is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * Authors:
+ *   Ekaitz Zarraga
+ */
+
+#ifndef _jit_riscv_h
+#define _jit_riscv_h
+
+#define JIT_NEEDS_LITERAL_POOL 1
+
+// x registers (integer register file), under their ABI names
+// Special registers
+#define _RA JIT_GPR(1) // Return address
+#define _SP JIT_GPR(2) // Stack pointer
+#define _GP JIT_GPR(3) // Global pointer
+#define _TP JIT_GPR(4) // Thread pointer
+#define _FP JIT_GPR(8) // Frame pointer
+#define _ZERO JIT_GPR(0) // Always zero
+// Argument passing
+#define _A0 JIT_GPR(10)
+#define _A1 JIT_GPR(11)
+#define _A2 JIT_GPR(12)
+#define _A3 JIT_GPR(13)
+#define _A4 JIT_GPR(14)
+#define _A5 JIT_GPR(15)
+#define _A6 JIT_GPR(16)
+#define _A7 JIT_GPR(17)
+// Saved registers
+#define _S0 _FP // S0 is the frame pointer normally
+#define _S1 JIT_GPR(9)
+#define _S2 JIT_GPR(18)
+#define _S3 JIT_GPR(19)
+#define _S4 JIT_GPR(20)
+#define _S5 JIT_GPR(21)
+#define _S6 JIT_GPR(22)
+#define _S7 JIT_GPR(23)
+#define _S8 JIT_GPR(24)
+#define _S9 JIT_GPR(25)
+#define _S10 JIT_GPR(26)
+#define _S11 JIT_GPR(27)
+// Temporaries
+#define _T0 JIT_GPR(5)
+#define _T1 JIT_GPR(6)
+#define _T2 JIT_GPR(7)
+#define _T3 JIT_GPR(28)
+#define _T4 JIT_GPR(29)
+#define _T5 JIT_GPR(30)
+#define _T6 JIT_GPR(31)
+
+// f registers (floating point register file), under their ABI names
+// Temporaries
+#define _FT0 JIT_FPR(0)
+#define _FT1 JIT_FPR(1)
+#define _FT2 JIT_FPR(2)
+#define _FT3 JIT_FPR(3)
+#define _FT4 JIT_FPR(4)
+#define _FT5 JIT_FPR(5)
+#define _FT6 JIT_FPR(6)
+#define _FT7 JIT_FPR(7)
+#define _FT8 JIT_FPR(28)
+#define _FT9 JIT_FPR(29)
+#define _FT10 JIT_FPR(30)
+#define _FT11 JIT_FPR(31)
+// Saved registers
+#define _FS0 JIT_FPR(8)
+#define _FS1 JIT_FPR(9)
+#define _FS2 JIT_FPR(18)
+#define _FS3 JIT_FPR(19)
+#define _FS4 JIT_FPR(20)
+#define _FS5 JIT_FPR(21)
+#define _FS6 JIT_FPR(22)
+#define _FS7 JIT_FPR(23)
+#define _FS8 JIT_FPR(24)
+#define _FS9 JIT_FPR(25)
+#define _FS10 JIT_FPR(26)
+#define _FS11 JIT_FPR(27)
+// Argument passing
+#define _FA0 JIT_FPR(10)
+#define _FA1 JIT_FPR(11)
+#define _FA2 JIT_FPR(12)
+#define _FA3 JIT_FPR(13)
+#define _FA4 JIT_FPR(14)
+#define _FA5 JIT_FPR(15)
+#define _FA6 JIT_FPR(16)
+#define _FA7 JIT_FPR(17)
+
+
+// JIT Registers
+// ----------------------------------------------------------------------
+// Caller-save registers JIT_R${NUM}
+// Callee-save registers JIT_V${NUM}
+// Caller-save temporary registers JIT_TMP${NUM}
+// Caller-save floating point registers JIT_F${NUM}
+// Callee-save floating point registers JIT_VF${NUM}
+// Caller-save floating point temporary registers JIT_FTMP${NUM}
+
+// Caller-save registers
+#define JIT_R0 _A0
+#define JIT_R1 _A1
+#define JIT_R2 _A2
+#define JIT_R3 _A3
+#define JIT_R4 _A4
+#define JIT_R5 _A5
+#define JIT_R6 _A6
+#define JIT_R7 _A7
+
+// Use this as a CARRY
+#define JIT_CARRY _T0
+
+// Temporaries, handed out by get_temp_gpr() in lightening.c
+#define JIT_TMP0 _T1
+#define JIT_TMP1 _T2
+#define JIT_TMP2 _T3
+#define JIT_TMP3 _T4
+#define JIT_TMP4 _T5
+#define JIT_TMP5 _T6
+
+// Callee-save registers
+#define JIT_V0 _S1
+#define JIT_V1 _S2
+#define JIT_V2 _S3
+#define JIT_V3 _S4
+#define JIT_V4 _S5
+#define JIT_V5 _S6
+#define JIT_V6 _S7
+#define JIT_V7 _S8
+#define JIT_V8 _S9
+#define JIT_V9 _S10
+#define JIT_V10 _S11
+
+
+// Callee-save floating point registers
+#define JIT_VF0 _FS0
+#define JIT_VF1 _FS1
+#define JIT_VF2 _FS2
+#define JIT_VF3 _FS3
+#define JIT_VF4 _FS4
+#define JIT_VF5 _FS5
+#define JIT_VF6 _FS6
+#define JIT_VF7 _FS7
+#define JIT_VF8 _FS8
+#define JIT_VF9 _FS9
+#define JIT_VF10 _FS10
+#define JIT_VF11 _FS11
+
+// Caller-save floating point registers
+#define JIT_F0 _FA0
+#define JIT_F1 _FA1
+#define JIT_F2 _FA2
+#define JIT_F3 _FA3
+#define JIT_F4 _FA4
+#define JIT_F5 _FA5
+#define JIT_F6 _FA6
+#define JIT_F7 _FA7
+// NOTE: These are temporaries, but we can use them as general purpose
+// registers as there's only one temporary JIT_FTMP supported by lightening.c
+#define JIT_F8 _FT0
+#define JIT_F9 _FT1
+#define JIT_F10 _FT2
+#define JIT_F11 _FT3
+#define JIT_F12 _FT4
+#define JIT_F13 _FT5
+#define JIT_F14 _FT6
+#define JIT_F15 _FT7
+#define JIT_F16 _FT8
+#define JIT_F17 _FT9
+#define JIT_F18 _FT10
+
+// Floating point temporary register
+#define JIT_FTMP _FT11
+
+// Special purpose registers
+#define JIT_FP _FP
+#define JIT_LR _RA
+#define JIT_SP _SP
+
+// TODO: Make sure this is correct
+#define JIT_PLATFORM_CALLEE_SAVE_GPRS JIT_LR
+
+#endif