diff --git a/libguile/lightening/.gitlab-ci.yml b/libguile/lightening/.gitlab-ci.yml
index a5e8694bf..4e4b40ff6 100644
--- a/libguile/lightening/.gitlab-ci.yml
+++ b/libguile/lightening/.gitlab-ci.yml
@@ -1,40 +1,60 @@
 image: debian:stable
 
 before_script:
-  - dpkg --add-architecture i386
-  - dpkg --add-architecture arm64
-  - dpkg --add-architecture armhf
   - apt-get update -qq
-  - apt-get install -y
-      libc6-dev:amd64 gcc make
-      binfmt-support qemu-user-static
-      gcc-i686-linux-gnu libc6-dev-i386-cross libc6:i386
-      gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64
-      gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6:armhf
-  - update-binfmts --enable qemu-aarch64
-  - update-binfmts --enable qemu-arm
+  - apt-get install -y make binfmt-support qemu-user-static
 
 x86-64:
   stage: test
   script:
+    - dpkg --add-architecture amd64  # must match the :amd64 package installed below
+    - apt-get update -qq
+    - apt-get install -y libc6-dev:amd64 gcc
     - make -C tests test-native
 
 i686:
   stage: test
   script:
+    - dpkg --add-architecture i386
+    - apt-get update -qq
+    - apt-get install -y gcc-i686-linux-gnu libc6-dev-i386-cross libc6:i386
     - make -C tests test-ia32 CC_IA32=i686-linux-gnu-gcc
 
 aarch64:
   stage: test
   script:
+    - dpkg --add-architecture arm64
+    - apt-get update -qq
+    - apt-get install -y gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64
     - make -C tests test-aarch64 CC_AARCH64=aarch64-linux-gnu-gcc
 
 armhf:
   stage: test
   script:
+    - dpkg --add-architecture armhf
+    - apt-get update -qq
+    - apt-get install -y gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6:armhf
     - make -C tests test-armv7 CC_ARMv7="arm-linux-gnueabihf-gcc -marm"
+
 
 armhf-thumb:
   stage: test
   script:
+    - dpkg --add-architecture armhf
+    - apt-get update -qq
+    - apt-get install -y gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6:armhf
     - make -C tests test-armv7 CC_ARMv7="arm-linux-gnueabihf-gcc -mthumb"
+
+
+riscv:
+  stage: test
+  script:
+    - dpkg --add-architecture riscv64
+    - apt-get update -qq
+    - apt-get install -y gcc-riscv64-linux-gnu
+    - echo 
/usr/local/lib/riscv64-linux-gnu >>/etc/ld.so.conf.d/riscv64-linux-gnu.conf + - echo /lib/riscv64-linux-gnu >>/etc/ld.so.conf.d/riscv64-linux-gnu.conf + - echo /usr/lib/riscv64-linux-gnu >>/etc/ld.so.conf.d/riscv64-linux-gnu.conf + - echo /usr/riscv64-linux-gnu/lib >>/etc/ld.so.conf.d/riscv64-linux-gnu.conf + - ln -s /usr/riscv64-linux-gnu/lib/ld-linux-riscv64-lp64d.so.1 /lib + - make -C tests test-riscv CC_RISCV="riscv64-linux-gnu-gcc -static" diff --git a/libguile/lightening/lightening.am b/libguile/lightening/lightening.am index 2c9089ead..ba55f2c7f 100644 --- a/libguile/lightening/lightening.am +++ b/libguile/lightening/lightening.am @@ -40,6 +40,7 @@ lightening_extra_files = \ $(lightening)/lightening/mips.h \ $(lightening)/lightening/ppc.h \ $(lightening)/lightening/x86.h \ + $(lightening)/lightening/riscv.h \ \ $(lightening)/lightening/aarch64.c \ $(lightening)/lightening/aarch64-cpu.c \ @@ -55,4 +56,7 @@ lightening_extra_files = \ $(lightening)/lightening/ppc-fpu.c \ $(lightening)/lightening/x86.c \ $(lightening)/lightening/x86-cpu.c \ - $(lightening)/lightening/x86-sse.c + $(lightening)/lightening/x86-sse.c \ + $(lightening)/lightening/riscv.c \ + $(lightening)/lightening/riscv-cpu.c \ + $(lightening)/lightening/riscv-fpu.c diff --git a/libguile/lightening/lightening.h b/libguile/lightening/lightening.h index efa5dfdf1..b364e18cc 100644 --- a/libguile/lightening/lightening.h +++ b/libguile/lightening/lightening.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2020 Free Software Foundation, Inc. + * Copyright (C) 2012-2020, 2025 Free Software Foundation, Inc. * * This file is part of GNU lightning. 
* @@ -77,6 +77,8 @@ jit_same_fprs (jit_fpr_t a, jit_fpr_t b) # include "lightening/aarch64.h" #elif defined(__s390__) || defined(__s390x__) # include "lightening/s390.h" +#elif defined(__riscv__) || defined(__riscv) +# include "lightening/riscv.h" #endif enum jit_reloc_kind @@ -622,6 +624,10 @@ jit_load_args_3(jit_state_t *_jit, jit_operand_t a, jit_operand_t b, M(_FF__, extr_f_d) \ M(_FF__, movr_f) \ M(_FF__, movr_d) \ + M(_GF__, movr_i_f) \ + M(_FG__, movr_f_i) \ + WHEN_64(M(_GF__, movr_l_d)) \ + WHEN_64(M(_FG__, movr_d_l)) \ M(_Ff__, movi_f) \ M(_Fd__, movi_d) \ M(_GF__, truncr_d_i) \ diff --git a/libguile/lightening/lightening/aarch64-fpu.c b/libguile/lightening/lightening/aarch64-fpu.c index 629734264..80dee334d 100644 --- a/libguile/lightening/lightening/aarch64-fpu.c +++ b/libguile/lightening/lightening/aarch64-fpu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013-2019 Free Software Foundation, Inc. + * Copyright (C) 2013-2019, 2025 Free Software Foundation, Inc. * * This file is part of GNU lightning. 
* @@ -638,6 +638,18 @@ movi_f(jit_state_t *_jit, int32_t r0, float i0) } } +static void +movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + FMOVSW(_jit, r0, r1); +} + +static void +movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + FMOVWS(_jit, r0, r1); +} + static jit_reloc_t buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1) { @@ -759,6 +771,18 @@ movi_d(jit_state_t *_jit, int32_t r0, double i0) } } +static void +movr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + FMOVDX(_jit, r0, r1); +} + +static void +movr_l_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + FMOVXD(_jit, r0, r1); +} + static jit_reloc_t buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1) { diff --git a/libguile/lightening/lightening/aarch64.c b/libguile/lightening/lightening/aarch64.c index 1018193c4..65a8066bf 100644 --- a/libguile/lightening/lightening/aarch64.c +++ b/libguile/lightening/lightening/aarch64.c @@ -165,7 +165,6 @@ struct abi_arg_iterator }; static size_t page_size; -static int has_lse_atomics; # define HWCAP_ATOMICS (1 << 8) @@ -262,3 +261,9 @@ bless_function_pointer(void *ptr) { return ptr; } + +static jit_gpr_t +get_callr_temp (jit_state_t * _jit) +{ + return _LR; +} diff --git a/libguile/lightening/lightening/arm-vfp.c b/libguile/lightening/lightening/arm-vfp.c index 208edc316..63134dcf5 100644 --- a/libguile/lightening/lightening/arm-vfp.c +++ b/libguile/lightening/lightening/arm-vfp.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2017, 2019 Free Software Foundation, Inc. + * Copyright (C) 2012-2017, 2019, 2025 Free Software Foundation, Inc. * * This file is part of GNU lightning. 
* @@ -913,6 +913,18 @@ movi_d(jit_state_t *_jit, int32_t r0, jit_float64_t i0) } } +static void +movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + VMOV_S_A(_jit, r0, r1); +} + +static void +movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + VMOV_A_S32(_jit, r0, r1); +} + static void extr_d_f(jit_state_t *_jit, int32_t r0, int32_t r1) { diff --git a/libguile/lightening/lightening/arm.c b/libguile/lightening/lightening/arm.c index d587e7158..11deedd89 100644 --- a/libguile/lightening/lightening/arm.c +++ b/libguile/lightening/lightening/arm.c @@ -109,7 +109,7 @@ next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg) } } *arg = jit_operand_mem (abi, JIT_SP, iter->stack_size); - iter->stack_size += 4; + iter->stack_size += 4 + (abi == JIT_OPERAND_ABI_DOUBLE ? 4 : 0); } static void @@ -137,3 +137,9 @@ bless_function_pointer(void *ptr) // Set low bit to mark as thumb mode. return (void*) (((uintptr_t)ptr) | 1); } + +static jit_gpr_t +get_callr_temp (jit_state_t * _jit) +{ + return _LR; +} diff --git a/libguile/lightening/lightening/endian.h b/libguile/lightening/lightening/endian.h index 3b34a1518..e3689a117 100644 --- a/libguile/lightening/lightening/endian.h +++ b/libguile/lightening/lightening/endian.h @@ -38,6 +38,8 @@ # else # define __WORDSIZE 64 # endif +# elif defined(__riscv_xlen) +# define __WORDSIZE __riscv_xlen /* riscv */ # else /* From FreeBSD 9.1 stdint.h */ # if defined(UINTPTR_MAX) && defined(UINT64_MAX) && \ (UINTPTR_MAX == UINT64_MAX) diff --git a/libguile/lightening/lightening/lightening.c b/libguile/lightening/lightening/lightening.c index db6e1176c..a1a225403 100644 --- a/libguile/lightening/lightening/lightening.c +++ b/libguile/lightening/lightening/lightening.c @@ -123,6 +123,8 @@ static void reset_abi_arg_iterator(struct abi_arg_iterator *iter, size_t argc, static void next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg); +static jit_gpr_t get_callr_temp (jit_state_t * _jit); + jit_bool_t init_jit(void) { @@ 
-268,6 +270,22 @@ get_temp_gpr(jit_state_t *_jit) #ifdef JIT_TMP1 case 1: return JIT_TMP1; +#endif +#ifdef JIT_TMP2 + case 2: + return JIT_TMP2; +#endif +#ifdef JIT_TMP3 + case 3: + return JIT_TMP3; +#endif +#ifdef JIT_TMP4 + case 4: + return JIT_TMP4; +#endif +#ifdef JIT_TMP5 + case 5: + return JIT_TMP5; #endif default: abort(); @@ -558,6 +576,8 @@ jit_emit_addr(jit_state_t *j) # include "aarch64.c" #elif defined(__s390__) || defined(__s390x__) # include "s390.c" +#elif defined(__riscv__) || defined(__riscv) +# include "riscv.c" #endif #define JIT_IMPL_0(stem, ret) \ @@ -786,6 +806,14 @@ abi_mem_to_gpr(jit_state_t *_jit, enum jit_operand_abi abi, case JIT_OPERAND_ABI_INT16: jit_ldxi_s(_jit, dst, base, offset); break; + case JIT_OPERAND_ABI_FLOAT: + { + jit_fpr_t tmp = get_temp_fpr(_jit); + jit_ldxi_f(_jit, tmp, base, offset); + jit_movr_i_f(_jit, dst, tmp); + unget_temp_fpr(_jit); + break; + } #if __WORDSIZE == 32 case JIT_OPERAND_ABI_UINT32: case JIT_OPERAND_ABI_POINTER: @@ -802,6 +830,14 @@ abi_mem_to_gpr(jit_state_t *_jit, enum jit_operand_abi abi, case JIT_OPERAND_ABI_INT64: jit_ldxi_l(_jit, dst, base, offset); break; + case JIT_OPERAND_ABI_DOUBLE: + { + jit_fpr_t tmp = get_temp_fpr(_jit); + jit_ldxi_d(_jit, tmp, base, offset); + jit_movr_l_d(_jit, dst, tmp); + unget_temp_fpr(_jit); + break; + } #endif default: abort(); @@ -866,7 +902,8 @@ enum move_kind { MOVE_KIND_ENUM(IMM, MEM), MOVE_KIND_ENUM(GPR, MEM), MOVE_KIND_ENUM(FPR, MEM), - MOVE_KIND_ENUM(MEM, MEM) + MOVE_KIND_ENUM(MEM, MEM), + MOVE_KIND_ENUM(FPR, GPR) }; #undef MOVE_KIND_ENUM @@ -880,6 +917,14 @@ move_operand(jit_state_t *_jit, jit_operand_t dst, jit_operand_t src) case MOVE_GPR_TO_GPR: return jit_movr(_jit, dst.loc.gpr.gpr, src.loc.gpr.gpr); + case MOVE_FPR_TO_GPR: +#if __WORDSIZE > 32 + if (src.abi == JIT_OPERAND_ABI_DOUBLE) + return jit_movr_l_d(_jit, dst.loc.gpr.gpr, src.loc.fpr); + else +#endif + return jit_movr_i_f(_jit, dst.loc.gpr.gpr, src.loc.fpr); + case MOVE_MEM_TO_GPR: return 
abi_mem_to_gpr(_jit, src.abi, dst.loc.gpr.gpr, src.loc.mem.base, src.loc.mem.offset); @@ -1095,6 +1140,15 @@ jit_move_operands(jit_state_t *_jit, jit_operand_t *dst, jit_operand_t *src, enum move_status status[argc]; for (size_t i = 0; i < argc; i++) status[i] = TO_MOVE; + + // Mem-to-mem moves require a temp register but don't overwrite + // other argument registers. Perform them first to free up the tmp + // for other uses. + for (size_t i = 0; i < argc; i++) + if ((status[i] == TO_MOVE) + && (MOVE_KIND (src[i].kind, dst[i].kind) == MOVE_MEM_TO_MEM)) + move_one(_jit, dst, src, argc, status, i); + for (size_t i = 0; i < argc; i++) if (status[i] == TO_MOVE) move_one(_jit, dst, src, argc, status, i); @@ -1155,6 +1209,9 @@ static const jit_gpr_t user_callee_save_gprs[] = { #endif #ifdef JIT_V9 , JIT_V9 +#endif +#ifdef JIT_V10 + , JIT_V10 #endif }; @@ -1183,6 +1240,18 @@ static const jit_fpr_t user_callee_save_fprs[] = { #ifdef JIT_VF7 , JIT_VF7 #endif +#ifdef JIT_VF8 + , JIT_VF8 +#endif +#ifdef JIT_VF9 + , JIT_VF9 +#endif +#ifdef JIT_VF10 + , JIT_VF10 +#endif +#ifdef JIT_VF11 + , JIT_VF11 +#endif }; #define ARRAY_SIZE(X) (sizeof (X)/sizeof ((X)[0])) @@ -1235,11 +1304,23 @@ jit_leave_jit_abi(jit_state_t *_jit, size_t v, size_t vf, size_t frame_size) // Precondition: stack is already aligned. static size_t -prepare_call_args(jit_state_t *_jit, size_t argc, jit_operand_t args[]) +prepare_call_args(jit_state_t *_jit, size_t argc, jit_operand_t args[], + jit_gpr_t *fun) { - jit_operand_t dst[argc]; + size_t count = argc + (fun == NULL ? 0 : 1); + jit_operand_t src[count]; + jit_operand_t dst[count]; + + memcpy (src, args, sizeof (jit_operand_t) * argc); + if (fun != NULL) { + jit_gpr_t fun_tmp = argc == 0 ? 
*fun : get_callr_temp (_jit); + src[argc] = jit_operand_gpr (JIT_OPERAND_ABI_POINTER, *fun); + dst[argc] = jit_operand_gpr (JIT_OPERAND_ABI_POINTER, fun_tmp); + *fun = fun_tmp; + } + struct abi_arg_iterator iter; - + // Compute shuffle destinations and space for spilled arguments. reset_abi_arg_iterator(&iter, argc, args); for (size_t i = 0; i < argc; i++) @@ -1264,7 +1345,7 @@ prepare_call_args(jit_state_t *_jit, size_t argc, jit_operand_t args[]) } } - jit_move_operands(_jit, dst, args, argc); + jit_move_operands(_jit, dst, src, count); return stack_size; } @@ -1272,7 +1353,7 @@ prepare_call_args(jit_state_t *_jit, size_t argc, jit_operand_t args[]) void jit_calli(jit_state_t *_jit, jit_pointer_t f, size_t argc, jit_operand_t args[]) { - size_t stack_bytes = prepare_call_args(_jit, argc, args); + size_t stack_bytes = prepare_call_args(_jit, argc, args, NULL); calli(_jit, (jit_word_t)f); @@ -1282,7 +1363,7 @@ jit_calli(jit_state_t *_jit, jit_pointer_t f, size_t argc, jit_operand_t args[]) void jit_callr(jit_state_t *_jit, jit_gpr_t f, size_t argc, jit_operand_t args[]) { - size_t stack_bytes = prepare_call_args(_jit, argc, args); + size_t stack_bytes = prepare_call_args(_jit, argc, args, &f); callr(_jit, jit_gpr_regno(f)); diff --git a/libguile/lightening/lightening/riscv-cpu.c b/libguile/lightening/lightening/riscv-cpu.c new file mode 100644 index 000000000..101f7395b --- /dev/null +++ b/libguile/lightening/lightening/riscv-cpu.c @@ -0,0 +1,2479 @@ +/* + * Copyright (C) 2012-2024 Free Software Foundation, Inc. + * + * This file is part of GNU lightning. + * + * GNU lightning is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 3, or (at your option) + * any later version. 
+ *
+ * GNU lightning is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ * License for more details.
+ *
+ * Authors:
+ *   Paulo Cesar Pereira de Andrade
+ *   Ekaitz Zarraga
+ */
+
+#if __BYTE_ORDER != __LITTLE_ENDIAN
+#error RISC-V requires little-endian host
+#endif
+
+#define stack_framesize (200 + 64)
+
+/* Range predicates: true when `im` fits a signed immediate of N bits. */
+#define simm6_p(im) ((im) <= 31 && (im) >= -32)
+#define simm12_p(im) ((im) <= 2047 && (im) >= -2048)
+#define simm20_p(im) ((im) <= 524287 && (im) >= -524288)
+#define simm32_p(im) ((im) <= 2147483647LL && (im) >= -2147483648LL)
+
+/*
+ * One 32-bit instruction word, viewed through the bit-field layout of
+ * each instruction format used by the encoders below.  Every struct
+ * view sums to exactly 32 bits, so filling all fields of one view
+ * fully defines `w`.
+ */
+typedef union {
+  /* Register-register format. */
+  struct {
+    uint32_t opcode : 7;
+    uint32_t rd : 5;
+    uint32_t funct3 : 3;
+    uint32_t rs1 : 5;
+    uint32_t rs2 : 5;
+    uint32_t funct7 : 7;
+  } R;
+  /* Register-register format with rl/aq bits and a 5-bit function code. */
+  struct {
+    uint32_t opcode : 7;
+    uint32_t rd : 5;
+    uint32_t funct3 : 3;
+    uint32_t rs1 : 5;
+    uint32_t rs2 : 5;
+    uint32_t rl : 1;
+    uint32_t aq : 1;
+    uint32_t funct5 : 5;
+  } R4;
+  /* Register + 12-bit immediate format. */
+  struct {
+    uint32_t opcode : 7;
+    uint32_t rd : 5;
+    uint32_t funct3 : 3;
+    uint32_t rs1 : 5;
+    uint32_t imm11_0 : 12;
+  } I;
+#if __WORDSIZE == 64
+  /* Shift-immediate format with a 6-bit shift amount. */
+  struct {
+    uint32_t opcode : 7;
+    uint32_t rd : 5;
+    uint32_t funct3 : 3;
+    uint32_t rs1 : 5;
+    uint32_t shamt : 6;
+    uint32_t imm6_0 : 6;
+  } IS;
+#endif
+  /* Store format: the 12-bit immediate is split in two pieces. */
+  struct {
+    uint32_t opcode : 7;
+    uint32_t imm4_0 : 5;
+    uint32_t funct3 : 3;
+    uint32_t rs1 : 5;
+    uint32_t rs2 : 5;
+    uint32_t imm11_5 : 7;
+  } S;
+  /* Conditional-branch format: the offset is scattered over four fields. */
+  struct {
+    uint32_t opcode : 7;
+    uint32_t imm11 : 1;
+    uint32_t imm4_1 : 4;
+    uint32_t funct3 : 3;
+    uint32_t rs1 : 5;
+    uint32_t rs2 : 5;
+    uint32_t imm10_5 : 6;
+    uint32_t imm12 : 1;
+  } B;
+  /* Upper-immediate format. */
+  struct {
+    uint32_t opcode : 7;
+    uint32_t rd : 5;
+    uint32_t imm31_12 : 20;
+  } U;
+  /* Jump format: the offset is scattered over four fields. */
+  struct {
+    uint32_t opcode : 7;
+    uint32_t rd : 5;
+    uint32_t imm19_12 : 8;
+    uint32_t imm11 : 1;
+    uint32_t imm10_1 : 10;
+    uint32_t imm20 : 1;
+  } J;
+  /* The raw instruction word. */
+  uint32_t w;
+} instr_t;
+
+
+// TODO: Compressed instruction support
+
+/* Build an R-format word; every argument must fit its field width. */
+static uint32_t
+Rtype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t rs2,
+      int32_t fct2)
+{
+  assert(!(op & ~0x7f));
+  assert(!(rd & ~0x1f));
+  assert(!(fct & ~0x07));
+  assert(!(rs1 & ~0x1f));
+  assert(!(rs2 & ~0x1f));
+  assert(!(fct2 & ~0x7f));
+  instr_t insn = {
+    .R = {
+      .opcode = op,
+      .rd = rd,
+      .funct3 = fct,
+      .rs1 = rs1,
+      .rs2 = rs2,
+      .funct7 = fct2,
+    }
+  };
+  return insn.w;
+}
+
+/* Build an R4-format word; note the (aq, rl) parameter order. */
+static uint32_t
+R4type(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t rs2,
+       int32_t aq, int32_t rl, int32_t fct5)
+{
+  assert(!(op & ~0x7f));
+  assert(!(rd & ~0x1f));
+  assert(!(fct & ~0x07));
+  assert(!(rs1 & ~0x1f));
+  assert(!(rs2 & ~0x1f));
+  assert(!(fct5 & ~0x1f));
+  assert(!(aq & ~0x01));
+  assert(!(rl & ~0x01));
+  instr_t insn = {
+    .R4 = {
+      .opcode = op,
+      .rd = rd,
+      .funct3 = fct,
+      .rs1 = rs1,
+      .rs2 = rs2,
+      .rl = rl,
+      .aq = aq,
+      .funct5 = fct5,
+    }
+  };
+  return insn.w;
+}
+
+/* Build an I-format word; `imm` must be a signed 12-bit value. */
+static uint32_t
+Itype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t imm)
+{
+  assert(!(op & ~0x7f));
+  assert(!(rd & ~0x1f));
+  assert(!(fct & ~0x07));
+  assert(!(rs1 & ~0x1f));
+  assert(simm12_p(imm));
+  instr_t insn = {
+    .I = {
+      .opcode = op,
+      .rd = rd,
+      .funct3 = fct,
+      .rs1 = rs1,
+      .imm11_0 = imm,
+    }
+  };
+  return insn.w;
+}
+
+# if __WORDSIZE == 64
+/* Build an IS-format word: 6-bit shift amount plus a 6-bit upper field. */
+static uint32_t
+IStype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t sh,
+       int32_t imm)
+{
+  assert(!(op & ~0x7f));
+  assert(!(rd & ~0x1f));
+  assert(!(fct & ~0x07));
+  assert(!(rs1 & ~0x1f));
+  assert(!(sh & ~0x3f));
+  assert(simm6_p(imm));
+  instr_t insn = {
+    .IS = {
+      .opcode = op,
+      .rd = rd,
+      .funct3 = fct,
+      .rs1 = rs1,
+      .shamt = sh,
+      .imm6_0 = imm,
+    }
+  };
+  return insn.w;
+}
+# endif
+
+/* Build an S-format word; the signed 12-bit `imm` is split 5 + 7 bits. */
+static uint32_t
+Stype(int32_t op, int32_t fct, int32_t rs1, int32_t rs2, int32_t imm)
+{
+  assert(!(op & ~0x7f));
+  assert(!(fct & ~0x07));
+  assert(!(rs1 & ~0x1f));
+  assert(!(rs2 & ~0x1f));
+  assert(simm12_p(imm));
+  instr_t insn = {
+    .S = {
+      .opcode = op,
+      .imm4_0 = imm & 0x1f,
+      .funct3 = fct,
+      .rs1 = rs1,
+      .rs2 = rs2,
+      .imm11_5 = (imm >> 5) & 0x7f,
+    }
+  };
+  return insn.w;
+}
+
+/* Build a B-format word; `imm` is an even byte offset whose half fits
+   in 12 signed bits. */
+static uint32_t
+Btype(int32_t op, int32_t fct, int32_t rs1, int32_t rs2, int32_t imm)
+{
+  assert(!(op & ~0x7f));
+  assert(!(fct & ~0x07));
+  assert(!(rs1 & ~0x1f));
+  assert(!(rs2 & ~0x1f));
+  assert(!(imm & 1));
+  assert(simm12_p(imm >> 1));
+
+  instr_t insn = {
+    .B = {
+      .opcode = op,
+      .imm11 = (imm >> 11) & 0x1,
+      .imm4_1 = (imm >> 1) & 0xf,
+      .funct3 = fct,
+      .rs1 = rs1,
+      .rs2 = rs2,
+      .imm10_5 = (imm >> 5) & 0x3f,
+      .imm12 = (imm >> 12) & 0x1,
+    }
+  };
+  return insn.w;
+}
+
+/* Build a U-format word; `imm` must be a signed 20-bit value. */
+static uint32_t
+Utype(int32_t op, int32_t rd, int32_t imm)
+{
+  assert(!(op & ~0x7f));
+  assert(!(rd & ~0x1f));
+  assert(simm20_p(imm));
+  instr_t insn = {
+    .U = {
+      .opcode = op,
+      .rd = rd,
+      .imm31_12 = imm,
+    }
+  };
+  return insn.w;
+}
+
+/* Build a J-format word; `imm` is an even byte offset whose half fits
+   in 20 signed bits. */
+static uint32_t
+Jtype(int32_t op, int32_t rd, int32_t imm)
+{
+  assert(!(op & ~0x7f));
+  assert(!(rd & ~0x1f));
+  assert(!(imm & 1));
+  assert(simm20_p(imm >> 1));
+
+  instr_t insn = {
+    .J = {
+      .opcode = op,
+      .rd = rd,
+      .imm19_12 = (imm >> 12) & 0xff,
+      .imm11 = (imm >> 11) & 0x1,
+      .imm10_1 = (imm >> 1) & 0x3ff,
+      .imm20 = (imm >> 20) & 0x1,
+    }
+  };
+  return insn.w;
+}
+
+/*
+ * RV32I Base Instruction Set
+ */
+#define _LUI(rd, imm) Utype(55, rd, imm)
+#define _AUIPC(rd, imm) Utype(23, rd, imm)
+#define _JAL(rd, imm) Jtype(111, rd, imm)
+#define _JALR(rd, rs1, imm) Itype(103, rd, 0, rs1, imm)
+#define _BEQ(rs1, rs2, imm) Btype(99, 0, rs1, rs2, imm)
+#define _BNE(rs1, rs2, imm) Btype(99, 1, rs1, rs2, imm)
+#define _BLT(rs1, rs2, imm) Btype(99, 4, rs1, rs2, imm)
+#define _BGE(rs1, rs2, imm) Btype(99, 5, rs1, rs2, imm)
+#define _BLTU(rs1, rs2, imm) Btype(99, 6, rs1, rs2, imm)
+#define _BGEU(rs1, rs2, imm) Btype(99, 7, rs1, rs2, imm)
+#define _LB(rd, rs1, imm) Itype(3, rd, 0, rs1, imm)
+#define _LH(rd, rs1, imm) Itype(3, rd, 1, rs1, imm)
+#define _LW(rd, rs1, imm) Itype(3, rd, 2, rs1, imm)
+#define _LBU(rd, rs1, imm) Itype(3, rd, 4, rs1, imm)
+#define _LHU(rd, rs1, 
imm) Itype(3, rd, 5, rs1, imm)
+#define _SB(rs1, rs2, imm) Stype(35, 0, rs1, rs2, imm)
+#define _SH(rs1, rs2, imm) Stype(35, 1, rs1, rs2, imm)
+#define _SW(rs1, rs2, imm) Stype(35, 2, rs1, rs2, imm)
+#define _ADDI(rd, rs1, imm) Itype(19, rd, 0, rs1, imm)
+#define _SLTI(rd, rs1, imm) Itype(19, rd, 2, rs1, imm)
+#define _SLTIU(rd, rs1, imm) Itype(19, rd, 3, rs1, imm)
+#define _XORI(rd, rs1, imm) Itype(19, rd, 4, rs1, imm)
+#define _ORI(rd, rs1, imm) Itype(19, rd, 6, rs1, imm)
+#define _ANDI(rd, rs1, imm) Itype(19, rd, 7, rs1, imm)
+#if __WORDSIZE == 32
+# define _SLLI(rd, rs1, imm) Rtype(19, rd, 1, rs1, imm, 0)
+# define _SRLI(rd, rs1, imm) Rtype(19, rd, 5, rs1, imm, 0)
+# define _SRAI(rd, rs1, imm) Rtype(19, rd, 5, rs1, imm, 32)
+#endif
+#define _ADD(rd, rs1, rs2) Rtype(51, rd, 0, rs1, rs2, 0)
+#define _SUB(rd, rs1, rs2) Rtype(51, rd, 0, rs1, rs2, 32)
+#define _SLL(rd, rs1, rs2) Rtype(51, rd, 1, rs1, rs2, 0)
+#define _SLT(rd, rs1, rs2) Rtype(51, rd, 2, rs1, rs2, 0)
+#define _SLTU(rd, rs1, rs2) Rtype(51, rd, 3, rs1, rs2, 0)
+#define _XOR(rd, rs1, rs2) Rtype(51, rd, 4, rs1, rs2, 0)
+#define _SRL(rd, rs1, rs2) Rtype(51, rd, 5, rs1, rs2, 0)
+#define _SRA(rd, rs1, rs2) Rtype(51, rd, 5, rs1, rs2, 32)
+#define _OR(rd, rs1, rs2) Rtype(51, rd, 6, rs1, rs2, 0)
+#define _AND(rd, rs1, rs2) Rtype(51, rd, 7, rs1, rs2, 0)
+#define _FENCE(imm) Itype( 15, 0, 0, 0, imm)
+#define _FENCE_I(imm) Itype( 15, 0, 1, 0, imm)
+#define _ECALL() Itype(115, 0, 0, 0, 0)
+#define _EBREAK() Itype(115, 0, 0, 0, 1)
+#define _CSRRW(rd, rs1, csr) Itype(115, rd, 1, rs1, csr)
+#define _CSRRS(rd, rs1, csr) Itype(115, rd, 2, rs1, csr)
+#define _CSRRC(rd, rs1, csr) Itype(115, rd, 3, rs1, csr)
+#define _CSRRWI(rd, zimm, csr) Itype(115, rd, 5, zimm, csr)
+#define _CSRRSI(rd, zimm, csr) Itype(115, rd, 6, zimm, csr)
+#define _CSRRCI(rd, zimm, csr) Itype(115, rd, 7, zimm, csr)
+/*
+ * RV64I Base Instruction Set (in addition to RV32I)
+ */
+#define _LWU(rd, rs1, imm) Itype(3, rd, 6, rs1, imm)
+#define _LD(rd, rs1, imm) Itype(3, rd, 3, rs1, imm)
+#define _SD(rs1, rs2, imm) Stype(35, 3, rs1, rs2, imm)
+#if __WORDSIZE == 64
+# define _SLLI(rd, rs1, sh) IStype(19, rd, 1, rs1, sh, 0)
+# define _SRLI(rd, rs1, sh) IStype(19, rd, 5, rs1, sh, 0)
+# define _SRAI(rd, rs1, sh) IStype(19, rd, 5, rs1, sh, 16)
+#endif
+#define _ADDIW(rd, rs1, imm) Itype(27, rd, 0, rs1, imm)
+#define _SLLIW(rd, rs1, imm) Rtype(27, rd, 1, rs1, imm, 0)
+/* Right shifts use funct3 = 5 (0b101), exactly like _SRLI/_SRAI and
+   _SRLW/_SRAW; logical vs. arithmetic is selected by funct7 only. */
+#define _SRLIW(rd, rs1, imm) Rtype(27, rd, 5, rs1, imm, 0)
+#define _SRAIW(rd, rs1, imm) Rtype(27, rd, 5, rs1, imm, 32)
+/* Register-register word operations: the third operand is a register. */
+#define _ADDW(rd, rs1, rs2) Rtype(59, rd, 0, rs1, rs2, 0)
+#define _SUBW(rd, rs1, rs2) Rtype(59, rd, 0, rs1, rs2, 32)
+#define _SLLW(rd, rs1, rs2) Rtype(59, rd, 1, rs1, rs2, 0)
+#define _SRLW(rd, rs1, rs2) Rtype(59, rd, 5, rs1, rs2, 0)
+#define _SRAW(rd, rs1, rs2) Rtype(59, rd, 5, rs1, rs2, 32)
+/*
+ * RV32M Standard Extension
+ */
+#define _MUL(rd, rs1, rs2) Rtype(51, rd, 0, rs1, rs2, 1)
+#define _MULH(rd, rs1, rs2) Rtype(51, rd, 1, rs1, rs2, 1)
+#define _MULHSU(rd, rs1, rs2) Rtype(51, rd, 2, rs1, rs2, 1)
+#define _MULHU(rd, rs1, rs2) Rtype(51, rd, 3, rs1, rs2, 1)
+#define _DIV(rd, rs1, rs2) Rtype(51, rd, 4, rs1, rs2, 1)
+#define _DIVU(rd, rs1, rs2) Rtype(51, rd, 5, rs1, rs2, 1)
+#define _REM(rd, rs1, rs2) Rtype(51, rd, 6, rs1, rs2, 1)
+#define _REMU(rd, rs1, rs2) Rtype(51, rd, 7, rs1, rs2, 1)
+/*
+ * RV64M Standard Extension (in addition to RV32M)
+ */
+#define _MULW(rd, rs1, rs2) Rtype(59, rd, 0, rs1, rs2, 1)
+#define _DIVW(rd, rs1, rs2) Rtype(59, rd, 4, rs1, rs2, 1)
+#define _DIVUW(rd, rs1, rs2) Rtype(59, rd, 5, rs1, rs2, 1)
+#define _REMW(rd, rs1, rs2) Rtype(59, rd, 6, rs1, rs2, 1)
+#define _REMUW(rd, rs1, rs2) Rtype(59, rd, 7, rs1, rs2, 1)
+/*
+ * RV32A Standard Extension
+ *
+ * NOTE(review): R4type() declares its ordering bits as (..., aq, rl,
+ * funct5), but the macros below forward their (rl, aq) parameters
+ * positionally, so the value passed as `rl` is stored in the aq bit and
+ * vice versa.  Left as-is because the callers are not visible from
+ * here; confirm against the call sites before swapping.
+ */
+#define _LR_W(rd, rs1, rl, aq) R4type(47, rd, 2, rs1, 0, rl, aq, 2)
+#define _SC_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 3)
+#define _AMOSWAP_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 1)
+#define _AMOADD_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 0)
+#define _AMOXOR_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 4)
+#define _AMOAND_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 12)
+#define _AMOOR_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 8)
+#define _AMOMIN_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 16)
+#define _AMOMAX_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 20)
+#define _AMOMINU_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 24)
+#define _AMOMAXU_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 28)
+/*
+ * RV64A Standard Extension (in addition to RV32A)
+ */
+#define _LR_D(rd, rs1, rl, aq) R4type(47, rd, 3, rs1, 0, rl, aq, 2)
+#define _SC_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 3)
+#define _AMOSWAP_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 1)
+#define _AMOADD_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 0)
+#define _AMOXOR_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 4)
+#define _AMOAND_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 12)
+#define _AMOOR_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 8)
+#define _AMOMIN_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 16)
+#define _AMOMAX_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 20)
+#define _AMOMINU_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 24)
+#define _AMOMAXU_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 28)
+/*
+ * Pseudo Instructions
+ */
+#define _NOP() _ADDI((jit_gpr_regno(_ZERO)),\
+                     (jit_gpr_regno(_ZERO)), 0)
+#define _MV(r0, r1) _ADDI(r0, r1, 0)
+#define _NOT(r0, r1) _XORI(r0, r1, -1)
+#define _NEG(r0, r1) _SUB(r0, (jit_gpr_regno(_ZERO)), r1)
+#define _NEGW(r0, r1) _SUBW(r0, (jit_gpr_regno(_ZERO)), r1)
+#define _SEXT_W(r0, r1) _ADDIW(r0, r1, 0)
+#define _RET() _JALR((jit_gpr_regno(_ZERO)),\
+                     (jit_gpr_regno(_RA)), 0)
+
+
+
+// Shorthand for emit_u32_with_pool(), used to emit one encoded instruction
+#define em_wp(jit, inst) emit_u32_with_pool(jit, inst)
+
+/*
+ * JIT INSTRUCTIONS
+ */
+
+// Binary ALU operations
+static void addr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void addi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void addcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void addci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void addxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void addxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+
+static void subr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void subcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void subxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+
+static void muli(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void mulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+
+static void divr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void divi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void divr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void divi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+
+static void remi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void remr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void remi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void remr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+
+static void andr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void andi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0);
+static void orr(jit_state_t *_jit, 
int32_t r0, int32_t r1, int32_t r2); +static void ori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void xorr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void xori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void lshi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void lshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void rshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void rshi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0); +static void rshr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void rshi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0); + + +// Four operand ALU operations +static void qmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qmulr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qmuli(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); +static void qmuli_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); + +static void qdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qdivr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qdivi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); +static void qdivi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); + + +// Unary ALU operations +static void negr(jit_state_t *_jit, int32_t r0, int32_t r1); +static void comr(jit_state_t *_jit, int32_t r0, int32_t r1); + + +// Transfer operations +static void movr(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movi(jit_state_t *_jit, int32_t r0, jit_word_t i0); + +static uint64_t patch_load_from_pool(uint64_t instrs, int32_t off); +static jit_reloc_t emit_load_from_pool(jit_state_t *_jit, uint64_t insts); +static jit_reloc_t mov_addr(jit_state_t *_jit, int32_t r0); +static 
jit_reloc_t movi_from_pool(jit_state_t *_jit, int32_t r0); + +static void extr_c(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_uc(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_s(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_us(jit_state_t *_jit, int32_t r0, int32_t r1); + +# if __WORDSIZE == 64 +static void extr_i(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_ui(jit_state_t *_jit, int32_t r0, int32_t r1); +#endif + + +// Branch instructions +static uint32_t patch_cc_jump(uint32_t inst, int32_t offset); +static jit_reloc_t emit_cc_jump(jit_state_t *_jit, uint32_t inst); + +static jit_reloc_t bltr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blti(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bltr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blti_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bler(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blei(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bler_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blei_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t beqr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t beqi(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bger(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgei(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bger_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgei_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bgtr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgti(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bgtr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgti_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bner(jit_state_t *_jit, int32_t r0, int32_t r1); 
+static jit_reloc_t bnei(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+
+static jit_reloc_t bmsr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bmsi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bmcr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bmci(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t boaddr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t boaddi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t boaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t boaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bxaddr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bxaddi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bxaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bxaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bosubr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bosubi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bosubr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bosubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bxsubr(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bxsubi(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+static jit_reloc_t bxsubr_u(jit_state_t *_jit, int32_t r0, int32_t r1);
+static jit_reloc_t bxsubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1);
+
+
+// Store operations
+// NOTE(review): the sti_*/stxr_*/stxi_* families declare no `_uc`
+// variant; confirm that str_uc is really defined in this file.
+static void str_c(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void str_uc(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void str_s(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void str_i(jit_state_t *_jit, int32_t r0, int32_t r1);
+#if __WORDSIZE == 64
+// 64-bit store; mirrors sti_l/stxr_l/stxi_l (this previously re-declared str_i).
+static void str_l(jit_state_t *_jit, int32_t r0, int32_t r1);
+#endif
+
+static void sti_c(jit_state_t *_jit, jit_word_t i0, int32_t r0);
+static void 
sti_s(jit_state_t *_jit, jit_word_t i0, int32_t r0); +static void sti_i(jit_state_t *_jit, jit_word_t i0, int32_t r0); +#if __WORDSIZE == 64 +static void sti_l(jit_state_t *_jit, jit_word_t i0, int32_t r0); +#endif + +static void stxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +# if __WORDSIZE == 64 +static void stxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +#endif + +static void stxi_c(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +static void stxi_s(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +static void stxi_i(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +# if __WORDSIZE == 64 +static void stxi_l(jit_state_t *_jit,jit_word_t i0,int32_t r0,int32_t r1); +# endif + + +// Load operations +static void ldr_c(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_uc(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_s(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_us(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_i(jit_state_t *_jit, int32_t r0, int32_t r1); +# if __WORDSIZE == 64 +static void ldr_ui(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_l(jit_state_t *_jit, int32_t r0, int32_t r1); +# endif + +static void ldi_c(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_uc(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_s(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_us(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_i(jit_state_t *_jit, int32_t r0, jit_word_t i0); +# if __WORDSIZE == 64 +static void ldi_ui(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_l(jit_state_t *_jit, int32_t r0, jit_word_t i0); +# endif + +static void ldxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void 
ldxr_uc(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_us(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +# if __WORDSIZE == 64 +static void ldxr_ui(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +#endif + +static void ldxi_c(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_uc(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_us(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_s(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_i(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +# if __WORDSIZE == 64 +static void ldxi_ui(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_l(jit_state_t *_jit,int32_t r0,int32_t r1,jit_word_t i0); +#endif + + +// Argument management +//static void pushr(jit_state_t *_jit, int32_t r0); +//static void popr(jit_state_t *_jit, int32_t r0); +static void ret(jit_state_t *_jit); +static void retr(jit_state_t *_jit, int32_t r0); +static void reti(jit_state_t *_jit, jit_word_t i0); +static void retval_c(jit_state_t *_jit, int32_t r0); +static void retval_uc(jit_state_t *_jit, int32_t r0); +static void retval_s(jit_state_t *_jit, int32_t r0); +static void retval_us(jit_state_t *_jit, int32_t r0); +static void retval_i(jit_state_t *_jit, int32_t r0); +# if __WORDSIZE == 64 +static void retval_ui(jit_state_t *_jit, int32_t r0); +static void retval_l(jit_state_t *_jit, int32_t r0); +#endif + +// Jump and return +static uint32_t patch_jump(uint32_t inst, int32_t offset); +static jit_reloc_t emit_jump(jit_state_t *_jit, uint32_t inst); + +static void callr(jit_state_t *_jit, int32_t r0); +static void calli(jit_state_t *_jit, jit_word_t 
i0); +static void jmpi_with_link(jit_state_t *_jit, jit_word_t i0); +static void pop_link_register(jit_state_t *_jit); +static void push_link_register(jit_state_t *_jit); +static void jmpr(jit_state_t *_jit, int32_t r0); +static void jmpi(jit_state_t *_jit, jit_word_t i0); +static jit_reloc_t jmp(jit_state_t *_jit); + + +// Atomic operations +static void ldr_atomic(jit_state_t *_jit, int32_t dst, int32_t loc); +static void str_atomic(jit_state_t *_jit, int32_t loc, int32_t val); +static void swap_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, + int32_t val); +static void cas_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, + int32_t expected, int32_t desired); + +// Byte swapping operations +static void bswapr_us(jit_state_t *_jit, int32_t r0, int32_t r1); +static void bswapr_ui(jit_state_t *_jit, int32_t r0, int32_t r1); +# if __WORDSIZE == 64 +static void +bswapr_ul(jit_state_t *_jit, int32_t r0, int32_t r1); +#endif + +// Others +static void nop(jit_state_t *_jit, int32_t im); +static void mfence(jit_state_t *_jit); +static void breakpoint(jit_state_t *_jit); + + + +/* + * Binary ALU operations + */ +static void +addr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _ADD(r0, r1, r2)); +} +static void +addi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _ADDI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + addr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +addcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + // TODO: Not sure if this is correct + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), jit_gpr_regno(t0), r1)); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addr(_jit, r0, r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r0, r1)); + } +} + +static 
void +addci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), jit_gpr_regno(t0), r1)); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addi(_jit, r0, r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r0, r1)); + } +} + +static void +addxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + addcr(_jit, r0, r1, r2); + addcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +addxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + addci(_jit, r0, r1, i0); + addcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +subr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SUB(r0, r1, r2)); +} + +static void +subi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + addi(_jit, r0, r1, -i0); +} + +static void +subcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + subr(_jit, jit_gpr_regno(t0), r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addr(_jit, r0, r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, r0)); + } +} + +static void +subci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + subi(_jit, jit_gpr_regno(t0), r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addi(_jit, r0, r1, i0); + em_wp(_jit, 
_SLTU(jit_gpr_regno(JIT_CARRY), r1, r0)); + } +} + +static void +subxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + subcr(_jit, r0, r1, r2); + subcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +subxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + subci(_jit, r0, r1, i0); + subcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +muli(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + mulr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +mulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _MUL(r0, r1, r2)); +} + +static void +divr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _DIV(r0, r1, r2)); +} + +static void +divi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + divr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +divr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _DIVU(r0, r1, r2)); +} + +static void +divi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + divr_u(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +remi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + remr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +remr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _REM(r0, r1, r2)); +} +static void +remi_u(jit_state_t 
*_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + remr_u(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +remr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _REMU(r0, r1, r2)); +} + +static void +andr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _AND(r0, r1, r2)); +} + +static void +andi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _ANDI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + em_wp(_jit, _AND(r0, r1, jit_gpr_regno(t0))); + unget_temp_gpr(_jit); + } +} + +static void +orr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _OR(r0, r1, r2)); +} + +static void +ori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _ORI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + orr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +xorr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _XOR(r0, r1, r2)); +} + +static void +xori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _XORI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + xorr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +lshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SLL(r0, r1, r2)); +} + +static void +lshi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _SLLI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + lshr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +rshr(jit_state_t *_jit, int32_t r0, 
int32_t r1, int32_t r2) +{ + em_wp(_jit, _SRA(r0, r1, r2)); +} + +static void +rshi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _SRAI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + rshr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +rshr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SRL(r0, r1, r2)); +} + +static void +rshi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _SRLI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + rshr_u(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + + +/* + * Four operand ALU operations + */ +static void +iqmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3, + jit_bool_t sign){ + if(r0 == r2 || r0 == r3){ + jit_gpr_t t0 = get_temp_gpr(_jit); + em_wp(_jit, _MUL(jit_gpr_regno(t0), r2, r3)); + if(sign) + em_wp(_jit, _MULH(r1, r2, r3)); + else + em_wp(_jit, _MULHU(r1, r2, r3)); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + em_wp(_jit, _MUL(r0, r2, r3)); + if(sign) + em_wp(_jit, _MULH(r1, r2, r3)); + else + em_wp(_jit, _MULHU(r1, r2, r3)); +} + +static void +qmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqmulr(_jit, r0, r1, r2, r3, 1); +} + +static void +qmulr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqmulr(_jit, r0, r1, r2, r3, 0); +} + +static void +qmuli(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqmulr(_jit, r0, r1, r2, jit_gpr_regno(t0), 1); + unget_temp_gpr(_jit); +} + +static void +qmuli_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + 
iqmulr(_jit, r0, r1, r2, jit_gpr_regno(t0), 0); + unget_temp_gpr(_jit); +} + +static void +iqdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3, + jit_bool_t sign){ + if(r0 == r2 || r0 == r3){ + jit_gpr_t t0 = get_temp_gpr(_jit); + if(sign){ + em_wp(_jit, _DIV(jit_gpr_regno(t0), r2, r3)); + em_wp(_jit, _REM(r1, r2, r3)); + } else { + em_wp(_jit, _DIVU(jit_gpr_regno(t0), r2, r3)); + em_wp(_jit, _REMU(r1, r2, r3)); + } + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + if(sign){ + em_wp(_jit, _DIV(r0, r2, r3)); + em_wp(_jit, _REM(r1, r2, r3)); + } else { + em_wp(_jit, _DIVU(r0, r2, r3)); + em_wp(_jit, _REMU(r1, r2, r3)); + } +} + +static void +qdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqdivr(_jit, r0, r1, r2, r3, 1); +} + +static void +qdivr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqdivr(_jit, r0, r1, r2, r3, 0); +} + +static void +qdivi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqdivr(_jit, r0, r1, r2, jit_gpr_regno(t0), 1); + unget_temp_gpr(_jit); +} + +static void +qdivi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqdivr(_jit, r0, r1, r2, jit_gpr_regno(t0), 0); + unget_temp_gpr(_jit); +} + + +/* + * Unary ALU operations + */ +static void +negr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _NEG(r0, r1)); +} + +static void +comr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _NOT(r0, r1)); +} + + +/* + * Branch instructions + */ +static uint32_t +patch_cc_jump(uint32_t inst, int32_t offset){ + instr_t i; + i.w = inst; + return Btype(i.B.opcode, i.B.funct3, i.B.rs1, i.B.rs2, offset); +} + +static jit_reloc_t +emit_cc_jump(jit_state_t *_jit, uint32_t inst) +{ + while (1) { + uint8_t *pc_base = _jit->pc.uc; // Offset is from 
current PC + int32_t off = (uint8_t*)jit_address(_jit) - pc_base; + jit_reloc_t ret = + jit_reloc (_jit, JIT_RELOC_JCC_WITH_VENEER, 0, _jit->pc.uc, pc_base, 0); + uint8_t cc_jump_width = 12; + if (add_pending_literal(_jit, ret, cc_jump_width - 1)) { + em_wp(_jit, patch_cc_jump(inst, off)); + return ret; + } + } +} + +static jit_reloc_t +bltr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BLT(r0, r1, 0)); +} + +static jit_reloc_t +blti(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bltr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bltr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BLTU(r0, r1, 0)); +} + +static jit_reloc_t +blti_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bltr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bler(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BGE(r1, r0, 0)); +} + +static jit_reloc_t +blei(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bler(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bler_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BGEU(r1, r0, 0)); +} + +static jit_reloc_t +blei_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bler_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +beqr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BEQ(r0, r1, 0)); +} + +static jit_reloc_t +beqi(jit_state_t *_jit, int32_t r0, 
jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = beqr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BGE(r0, r1, 0)); +} + +static jit_reloc_t +bgei(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bger(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BGEU(r0, r1, 0)); +} + +static jit_reloc_t +bgei_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bger_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bgtr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr(_jit, r1, r0); +} + +static jit_reloc_t +bgti(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bgtr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bgtr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr_u(_jit, r1, r0); +} + +static jit_reloc_t +bgti_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bgtr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bner(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return emit_cc_jump(_jit, _BNE(r0, r1, 0)); +} + +static jit_reloc_t +bnei(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bner(_jit, r0, jit_gpr_regno(t0)); + 
unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bmsr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + andr(_jit, jit_gpr_regno(t0), r0, r1); + jit_reloc_t ret = bner(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bmsi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + andi(_jit, jit_gpr_regno(t0), r0, i0); + jit_reloc_t ret = bner(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bmcr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + andr(_jit, jit_gpr_regno(t0), r0, r1); + jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bmci(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + andi(_jit, jit_gpr_regno(t0), r0, i0); + jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +boaddr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + // NOTE: We need tons of temporaries because RISC-V doesn't provide any + // easy way to solve this. We need to do it in software. 
+ jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + jit_gpr_t t2 = get_temp_gpr(_jit); + + addr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0)); + em_wp(_jit, _SLT(jit_gpr_regno(t2), jit_gpr_regno(t0), r0)); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = bner(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +boaddi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = boaddr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +boaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + + addr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTU(jit_gpr_regno(t1), jit_gpr_regno(t0), r0)); + movr(_jit, r0, jit_gpr_regno(t0)); + + jit_reloc_t ret = bnei(_jit, jit_gpr_regno(t1), 0); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +boaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = boaddr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxaddr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + jit_gpr_t t2 = get_temp_gpr(_jit); + + addr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0)); + em_wp(_jit, _SLT(jit_gpr_regno(t2), jit_gpr_regno(t0), r0)); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxaddi(jit_state_t *_jit, int32_t r0, 
jit_word_t i0) +{ + + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bxaddr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + + addr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTU(jit_gpr_regno(t1), jit_gpr_regno(t0), r0)); + movr(_jit, r0, jit_gpr_regno(t0)); + + jit_reloc_t ret = beqi(_jit, jit_gpr_regno(t1), 0); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bxaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bxaddr_u(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bosubr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + jit_gpr_t t2 = get_temp_gpr(_jit); + + subr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0)); + em_wp(_jit, _SLT(jit_gpr_regno(t2), r0, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = bner(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bosubi(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jit_reloc_t ret = bosubr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bosubr_u(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + jit_gpr_t t1 = get_temp_gpr(_jit); + + subr(_jit, jit_gpr_regno(t0), r0, r1); + + em_wp(_jit, _SLTU(jit_gpr_regno(t1), r0, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + jit_reloc_t ret = beqi(_jit, 
/*
 * Number of consecutive zero bits at the least-significant end of x.
 * Returns 64 for x == 0, since no set bit would ever stop the scan.
 */
static int
count_trailing_zeros(uint64_t x)
{
  int zeros = 0;

  if (x == 0)
    return 64;

  /* Drop low bits one at a time until a set bit reaches position 0. */
  for (; (x & 1u) == 0; x >>= 1)
    ++zeros;

  return zeros;
}
patch_load_from_pool(insts, off)); + return ret; + } + } +} +static jit_reloc_t +movi_from_pool(jit_state_t *_jit, int32_t r0) +{ + load_from_pool_t insts; + insts.inst.auipc.w = _AUIPC(r0, 0); +#if __WORDSIZE == 64 + insts.inst.load.w = _LD(r0, r0, 0); +#elif __WORDSIZE == 32 + insts.inst.load.w = _LW(r0, r0, 0); +#endif + return emit_load_from_pool(_jit, insts.l); +} +static jit_reloc_t +mov_addr(jit_state_t *_jit, int32_t r0) +{ + return movi_from_pool(_jit, r0); +} + + +static void +extr_c(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 8; + lshi(_jit, r0, r1, rot); + rshi(_jit, r0, r0, rot); +} + +static void +extr_uc(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 8; + lshi(_jit, r0, r1, rot); + rshi_u(_jit, r0, r0, rot); +} + +static void +extr_s(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 16; + lshi(_jit, r0, r1, rot); + rshi(_jit, r0, r0, rot); +} + +static void +extr_us(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 16; + lshi(_jit, r0, r1, rot); + rshi_u(_jit, r0, r0, rot); +} + +# if __WORDSIZE == 64 +static void +extr_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 32; + lshi(_jit, r0, r1, rot); + rshi(_jit, r0, r0, rot); +} +static void +extr_ui(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int rot = __WORDSIZE - 32; + lshi(_jit, r0, r1, rot); + rshi_u(_jit, r0, r0, rot); +} +#endif + +/* + * Store operations + */ +static void +str_c(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SB(r0, r1, 0)); +} +static void +str_uc(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SB(r0, r1, 0)); +} +static void +str_s(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SH(r0, r1, 0)); +} +static void +str_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SW(r0, r1, 0)); +} +#if __WORDSIZE == 64 +static void +str_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _SD(r0, r1, 0)); +} 
+#endif + +static void +sti_c(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_c(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} + +static void +sti_s(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_s(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} + +static void +sti_i(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_i(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} + +#if __WORDSIZE == 64 +static void +sti_l(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_l(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} +#endif + +static void +stxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_c(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} + +static void +stxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_s(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} + +static void +stxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_i(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} + +# if __WORDSIZE == 64 +static void +stxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_l(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} +#endif + +static void +stxi_c(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _SB(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, 
jit_gpr_regno(t0), r0, i0); + str_c(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} + + +static void +stxi_s(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _SH(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_s(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} + + +static void +stxi_i(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _SW(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_i(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} + +# if __WORDSIZE == 64 +static void +stxi_l(jit_state_t *_jit,jit_word_t i0,int32_t r0,int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _SD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_l(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} +# endif + + +/* + * Load operations + */ +static void +ldr_c(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LB(r0, r1, 0)); +} + +static void +ldr_uc(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LBU(r0, r1, 0)); +} + +static void +ldr_s(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LH(r0, r1, 0)); +} + +static void +ldr_us(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LHU(r0, r1, 0)); +} + +static void +ldr_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LW(r0, r1, 0)); +} + +# if __WORDSIZE == 64 +static void +ldr_ui(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LWU(r0, r1, 0)); +} + +static void +ldr_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _LD(r0, r1, 0)); +} +# endif + + +static void +ldi_c(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_c(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); 
+} + +static void +ldi_uc(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_uc(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldi_s(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_s(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldi_us(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_us(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + + +static void +ldi_i(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_i(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +# if __WORDSIZE == 64 +static void +ldi_ui(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_ui(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldi_l(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_l(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +#endif + + + + +static void +ldxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_c(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_uc(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_uc(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_s(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void 
+ldxr_us(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_us(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_i(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +# if __WORDSIZE == 64 +static void +ldxr_ui(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_ui(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_l(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +#endif + + + + +static void +ldxi_c(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LB(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_c(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_uc(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LBU(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_uc(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_us(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LHU(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_us(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_s(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LH(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, 
jit_gpr_regno(t0), r1, i0); + ldr_s(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_i(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LW(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_i(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +# if __WORDSIZE == 64 +static void +ldxi_ui(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LWU(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_ui(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldxi_l(jit_state_t *_jit,int32_t r0,int32_t r1,jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _LD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_l(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +#endif + + +/* + * Argument management + */ + +// static void +// pushr(jit_state_t *_jit, int32_t r0) +// { +// #if __WORDSIZE == 64 +// addi(jit_gpr_regno(_SP), -8); +// em_wp(_SD(r0, jit_gpr_regno(_SP), 0)); +// #elif __WORDSIZE == 32 +// addi(jit_gpr_regno(_SP), -4); +// em_wp(_SW(r0, jit_gpr_regno(_SP), 0)); +// #endif +// } +// static void +// popr(jit_state_t *_jit, int32_t r0) +// { +// #if __WORDSIZE == 64 +// em_wp(_jit, _LD(r0, jit_gpr_regno(_SP), 0)); +// addi(jit_gpr_regno(_SP), 8); +// #elif __WORDSIZE == 32 +// em_wp(_jit, _LW(r0, jit_gpr_regno(_SP), 0)); +// addi(jit_gpr_regno(_SP), 4); +// #endif +// } + +static void +ret(jit_state_t *_jit) +{ + em_wp(_jit, _RET()); +} + +static void +retr(jit_state_t *_jit, int32_t r0) +{ + movr(_jit, jit_gpr_regno(_A0), r0); + ret(_jit); +} + +static void +reti(jit_state_t *_jit, jit_word_t i0) +{ + movi(_jit, jit_gpr_regno(_A0), i0); + ret(_jit); +} + +static void +retval_c(jit_state_t *_jit, int32_t r0) +{ + extr_c(_jit, r0, 
jit_gpr_regno(_A0)); +} + +static void +retval_uc(jit_state_t *_jit, int32_t r0) +{ + extr_uc(_jit, r0, jit_gpr_regno(_A0)); +} + +static void +retval_s(jit_state_t *_jit, int32_t r0) +{ + extr_s(_jit, r0, jit_gpr_regno(_A0)); +} + +static void +retval_us(jit_state_t *_jit, int32_t r0) +{ + extr_us(_jit, r0, jit_gpr_regno(_A0)); +} + +static void +retval_i(jit_state_t *_jit, int32_t r0) +{ + extr_i(_jit, r0, jit_gpr_regno(_A0)); +} + +# if __WORDSIZE == 64 +static void +retval_ui(jit_state_t *_jit, int32_t r0) +{ + extr_ui(_jit, r0, jit_gpr_regno(_A0)); +} + +static void +retval_l(jit_state_t *_jit, int32_t r0) +{ + movr(_jit, r0, jit_gpr_regno(_A0)); +} +#endif + +/* + * Jump and return instructions + */ +static uint32_t +patch_jump(uint32_t inst, int32_t offset) +{ + instr_t i; + i.w = inst; + return Jtype(i.J.opcode, i.J.rd, offset); +} +static jit_reloc_t +emit_jump(jit_state_t *_jit, uint32_t inst) +{ + while (1) { + uint8_t *pc_base = _jit->pc.uc; // Offset is from current PC + int32_t off = (uint8_t*)jit_address(_jit) - pc_base; + jit_reloc_t ret = + jit_reloc (_jit, JIT_RELOC_JMP_WITH_VENEER, 0, _jit->pc.uc, pc_base, 0); + uint8_t jump_width = 20; + if (add_pending_literal(_jit, ret, jump_width - 1)) { + em_wp(_jit, patch_jump(inst, off)); + return ret; + } + } +} + +static void +callr(jit_state_t *_jit, int32_t r0) +{ + em_wp(_jit, _JALR(jit_gpr_regno(_RA), r0, 0)); +} + +static void +calli(jit_state_t *_jit, jit_word_t i0) +{ + jit_word_t jumpoffset = i0 - (jit_word_t)(_jit->pc.uc); + if (simm20_p(jumpoffset)){ + em_wp(_jit, _JAL(jit_gpr_regno(_RA), jumpoffset)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + callr(_jit, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +jmpi_with_link(jit_state_t *_jit, jit_word_t i0) +{ + calli(_jit, i0); +} + +static void +pop_link_register(jit_state_t *_jit) +{ +} + +static void +push_link_register(jit_state_t *_jit) +{ +} + +static void +jmpr(jit_state_t *_jit, 
int32_t r0) +{ + em_wp(_jit, _JALR(jit_gpr_regno(_ZERO), r0, 0)); +} + +static void +jmpi(jit_state_t *_jit, jit_word_t i0) +{ + jit_word_t jumpoffset = i0 - (jit_word_t)(_jit->pc.uc); + if (simm20_p(jumpoffset)){ + em_wp(_jit, _JAL(jit_gpr_regno(_ZERO), jumpoffset)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + jmpr(_jit, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static jit_reloc_t +jmp(jit_state_t *_jit) +{ + return emit_jump(_jit, _JAL(jit_gpr_regno(_ZERO), 0)); +} + + + +/* + * Atomic operations + */ + +static void +ldr_atomic(jit_state_t *_jit, int32_t dst, int32_t loc) +{ + em_wp(_jit, _FENCE(0xFF)); + ldr_i(_jit, dst, loc); + em_wp(_jit, _FENCE(0xFF)); +} + +static void +str_atomic(jit_state_t *_jit, int32_t loc, int32_t val) +{ + em_wp(_jit, _FENCE(0xFF)); + str_i(_jit, loc, val); + em_wp(_jit, _FENCE(0xFF)); +} + +static void +swap_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, int32_t val) +{ +#if __WORDSIZE == 64 + em_wp(_jit, _AMOSWAP_D(dst, loc, val, 1, 1)); +#elif __WORDSIZE == 32 + em_wp(_jit, _AMOSWAP_W(dst, loc, val, 1, 1)); +#endif +} + +static void +cas_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, int32_t expected, + int32_t desired) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + void *retry = jit_address(_jit); + +#if __WORDSIZE == 64 + em_wp(_jit, _LR_D(t0, loc, 0,0)); +#elif __WORDSIZE == 32 + em_wp(_jit, _LR_W(t0, loc, 0,0)); +#endif + + jit_reloc_t fail = bner(_jit, t0, expected); + +#if __WORDSIZE == 64 + em_wp(_jit, _SC_D(t1, desired, loc, 0,0)); +#elif __WORDSIZE == 32 + em_wp(_jit, _SC_W(t1, desired, loc, 0,0)); +#endif + + jit_patch_there(_jit, bner(_jit, t1, jit_gpr_regno(_ZERO)), retry); + + jit_patch_here(_jit, fail); + em_wp(_jit, _FENCE(0xFF)); + movr(_jit, dst, t0); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); +} + + +/* + * Byte swapping operations + * RISC-V Doesn't provide them by default. 
+ * There's a B extension (Standard Extension for Bit Manipulation) draft, but + * it's not official yet: + * https://github.com/riscv/riscv-bitmanip + * Meanwhile, we need to implement them in software. + */ +static void +bswapr_uany(jit_state_t *_jit, int32_t r0, int32_t r1, size_t size) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + andi(_jit, r0, r1, 0xFF); + for(int i = 1; i < size; i++){ + lshi(_jit, r0, r0, 8); + rshi(_jit, t0, r1, 8*i); + andi(_jit, t0, t0, 0xFF); + orr(_jit, r0, r0, t0); + } + unget_temp_gpr(_jit); +} + +static void +bswapr_us(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + bswapr_uany(_jit, r0, r1, 2); +} + +static void +bswapr_ui(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + bswapr_uany(_jit, r0, r1, 4); +} + +# if __WORDSIZE == 64 +static void +bswapr_ul(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + bswapr_uany(_jit, r0, r1, 8); +} +#endif + + + +/* + * Others + * TODO + */ +static void +nop(jit_state_t *_jit, int32_t im) +{ + for (; im > 0; im -= 4) + em_wp(_jit, _NOP()); + assert(im == 0); +} +static void +mfence(jit_state_t *_jit) +{ + // TODO: we may need it for atomic operations? 
+ em_wp(_jit, _FENCE(0xFF)); +} + +static void +breakpoint(jit_state_t *_jit) +{ + em_wp(_jit, _EBREAK()); +} diff --git a/libguile/lightening/lightening/riscv-fpu.c b/libguile/lightening/lightening/riscv-fpu.c new file mode 100644 index 000000000..b4e7546c7 --- /dev/null +++ b/libguile/lightening/lightening/riscv-fpu.c @@ -0,0 +1,883 @@ +/* + * RV32F Standard Extension + */ +#define _FLW(rd, rs1, im) Itype(7, rd, 2, rs1, im) +#define _FSW(rs1, rs2, imm) Stype(39, 2, rs1, rs2, imm) +#define _FMADD_S(rd, rs1, rs2, rs3) R4type(67, rd, 0, rs1, rs2, 0, rs3) +#define _FMSUB_S(rd, rs1, rs2, rs3) R4type(71, rd, 0, rs1, rs2, 0, rs3) +#define _FNMSUB_S(rd, rs1, rs2, rs3) R4type(75, rd, 0, rs1, rs2, 0, rs3) +#define _FNMADD_S(rd, rs1, rs2, rs3) R4type(79, rd, 0, rs1, rs2, 0, rs3) +#define _FADD_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 0) +#define _FSUB_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 4) +#define _FMUL_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 8) +#define _FDIV_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 12) +#define _FSQRT_S(rd, rs1) Rtype(83, rd, 0, rs1, 0, 44) +#define _FSGNJ_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 16) +#define _FSGNJN_S(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 16) +#define _FSGNJX_S(rd, rs1, rs2) Rtype(83, rd, 2, rs1, rs2, 16) +#define _FMIN_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 20) +#define _FMAX_S(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 20) +#define _FCVT_W_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 96) +#define _FCVT_WU_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 96) +#define _FMV_X_W(rd, rs1) Rtype(83, rd, 0, rs1, 0, 112) +#define _FEQ_S(rd, rs1, rs2) Rtype(83, rd, 2, rs1, rs2, 80) +#define _FLT_S(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 80) +#define _FLE_S(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 80) +#define _FCLASS_S(rd, rs1) Rtype(83, rd, 1, rs1, 0, 112) +#define _FCVT_S_W(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 104) +#define _FCVT_S_WU(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 104) +#define _FMV_W_X(rd, rs1) Rtype(83, rd, 0, rs1, 0, 
120) +/* + * RV64F Standard Extension (in addition to RV32F) + */ +#define _FCVT_L_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 2, 96) +#define _FCVT_LU_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 3, 96) +#define _FCVT_S_L(rd, rs1, rm) Rtype(83, rd, rm, rs1, 2, 104) +#define _FCVT_S_LU(rd, rs1, rm) Rtype(83, rd, rm, rs1, 3, 104) +/* + * RV32D Standard Extension + */ +#define _FLD(rd, rs1, im) Itype(7, rd, 3, rs1, im) +#define _FSD(rs1, rs2, imm) Stype(39, 3, rs1, rs2, imm) +#define _FMADD_D(rd, rs1, rs2, rs3) R4type(67, rd, 0, rs1, rs2, 1, rs3) +#define _FMSUB_D(rd, rs1, rs2, rs3) R4type(71, rd, 0, rs1, rs2, 1, rs3) +#define _FNMSUB_D(rd, rs1, rs2, rs3) R4type(75, rd, 0, rs1, rs2, 1, rs3) +#define _FNMADD_D(rd, rs1, rs2, rs3) R4type(79, rd, 0, rs1, rs2, 1, rs3) +#define _FADD_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 1) +#define _FSUB_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 5) +#define _FMUL_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 9) +#define _FDIV_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 13) +#define _FSQRT_D(rd, rs1) Rtype(83, rd, 0, rs1, 0, 45) +#define _FSGNJ_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 17) +#define _FSGNJN_D(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 17) +#define _FSGNJX_D(rd, rs1, rs2) Rtype(83, rd, 2, rs1, rs2, 17) +#define _FMIN_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 21) +#define _FMAX_D(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 21) +#define _FCVT_S_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 32) +#define _FCVT_D_S(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 33) +#define _FEQ_D(rd, rs1, rs2) Rtype(83, rd, 2, rs1, rs2, 81) +#define _FLT_D(rd, rs1, rs2) Rtype(83, rd, 1, rs1, rs2, 81) +#define _FLE_D(rd, rs1, rs2) Rtype(83, rd, 0, rs1, rs2, 81) +#define _FCLASS_D(rd, rs1) Rtype(83, rd, 1, rs1, 0, 113) +#define _FCVT_W_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 97) +#define _FCVT_WU_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 97) +#define _FCVT_D_W(rd, rs1, rm) Rtype(83, rd, rm, rs1, 0, 105) +#define _FCVT_D_WU(rd, rs1, rm) Rtype(83, rd, rm, rs1, 1, 105) +/* + * 
RV64D Standard Extension (in addition to RV32D) + */ +#define _FCVT_L_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 2, 97) +#define _FCVT_LU_D(rd, rs1, rm) Rtype(83, rd, rm, rs1, 3, 97) +#define _FMV_X_D(rd, rs1) Rtype(83, rd, 0, rs1, 0, 113) +#define _FCVT_D_L(rd, rs1, rm) Rtype(83, rd, rm, rs1, 2, 105) +#define _FCVT_D_LU(rd, rs1, rm) Rtype(83, rd, rm, rs1, 3, 105) +#define _FMV_D_X(rd, rs1) Rtype(83, rd, 0, rs1, 0, 121) +/* + * Pseudo instructions + */ +#define _FMV_S(r0, r1) _FSGNJ_S(r0, r1, r1) +#define _FABS_S(r0, r1) _FSGNJX_S(r0, r1, r1) +#define _FNEG_S(r0, r1) _FSGNJN_S(r0, r1, r1) +#define _FMV_D(r0, r1) _FSGNJ_D(r0, r1, r1) +#define _FABS_D(r0, r1) _FSGNJX_D(r0, r1, r1) +#define _FNEG_D(r0, r1) _FSGNJN_D(r0, r1, r1) + +// Binary ALU operations +static void addr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void addr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void mulr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void mulr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void divr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void divr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); + +// Unary ALU operations +static void sqrtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void sqrtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void negr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void negr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void absr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void absr_d(jit_state_t *_jit, int32_t r0, int32_t r1); + +// Transfer operations +static void movr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1); 
+static void movr_l_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1); + +// Argument management +static void retr_f(jit_state_t *_jit, int32_t u); +static void retr_d(jit_state_t *_jit, int32_t u); + +// Load operations +static void ldr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldi_f(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxi_f(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldi_d(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxi_d(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); + +// Store operations +static void str_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void str_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void sti_f(jit_state_t *_jit, jit_word_t i0, int32_t r0); +static void stxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxi_f(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +static void sti_d(jit_state_t *_jit, jit_word_t i0, int32_t r0); +static void stxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxi_d(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); + +// Branch instructions +static jit_reloc_t bltr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bler_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t beqr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bger_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bner_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunltr_f(jit_state_t 
*_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunler_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunger_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bungtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bltgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bordr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunordr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bltr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bler_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t beqr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bger_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bner_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunltr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunler_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunger_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bungtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bltgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bordr_d(jit_state_t *_jit, int32_t r0, int32_t r1); + +/* + * Binary ALU operations + */ +static void +addr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FADD_S(r0, r1, r2)); +} +static void +addr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FADD_D(r0, r1, r2)); +} +static void +subr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FSUB_S(r0, r1, r2)); +} +static void +subr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FSUB_D(r0, r1, r2)); +} +static 
void +mulr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FMUL_S(r0, r1, r2)); +} +static void +mulr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FMUL_D(r0, r1, r2)); +} +static void +divr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FDIV_S(r0, r1, r2)); +} +static void +divr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FDIV_D(r0, r1, r2)); +} + +/* + * Unary ALU operations + */ +static void +sqrtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSQRT_S(r0, r1)); +} +static void +sqrtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSQRT_D(r0, r1)); +} +static void +negr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FNEG_S(r0, r1)); +} +static void +negr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FNEG_D(r0, r1)); +} +static void +absr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FABS_S(r0, r1)); +} + +static void +absr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FABS_D(r0, r1)); +} + + +/* + * Load operations + */ +static void +ldr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FLW(r0, r1, 0)); +} +static void +ldr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FLD(r0, r1, 0)); +} +static void +ldi_f(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_f(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_f(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxi_f(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _FLW(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, 
i0); + ldr_f(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldi_d(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_d(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_d(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldxi_d(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _FLD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_d(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + + + +/* + * Store operations + */ +static void +str_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSW(r0, r1, 0)); +} +static void +str_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSD(r0, r1, 0)); +} +static void +sti_f(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_f(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} +static void +stxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_f(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} +static void +stxi_f(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _FSW(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_f(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} +static void +sti_d(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_d(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} +static void +stxr_d(jit_state_t 
*_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_d(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} +static void +stxi_d(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _FSD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_d(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} + + +/* + * Transfer operations + */ +static void +movr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + if (r0 != r1) + em_wp(_jit, _FMV_S(r0, r1)); +} + +static void +movr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + if (r0 != r1) + em_wp(_jit, _FMV_D(r0, r1)); +} +static void +movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FMV_X_W(r0, r1)); +} +static void +movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FMV_W_X(r0, r1)); +} +static void +movr_l_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FMV_X_D(r0, r1)); +} +static void +movr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FMV_D_X(r0, r1)); +} + +static void +truncr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_W_S(r0, r1, 1)); +} +static void +truncr_d_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_W_D(r0, r1, 1)); +} +static void +truncr_f_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_L_S(r0, r1, 1)); +} +static void +truncr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_L_D(r0, r1, 1)); +} + +static void +extr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ +#if __WORDSIZE == 64 + em_wp(_jit, _FCVT_S_L(r0, r1, 0)); +#elif __WORDSIZE == 32 + em_wp(_jit, _FCVT_S_W(r0, r1, 0)); +#endif +} +static void +extr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ +#if __WORDSIZE == 64 + em_wp(_jit, _FCVT_D_L(r0, r1, 0)); +#elif __WORDSIZE == 32 + em_wp(_jit, _FCVT_D_W(r0, 
r1, 0)); +#endif +} + +static void +extr_f_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_D_S(r0, r1, 0)); +} +static void +extr_d_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_S_D(r0, r1, 0)); +} + +static void +movi_f(jit_state_t *_jit, int32_t r0, jit_float32_t i0) +{ + union { int32_t i; jit_float32_t f; } u = { .f = i0 }; + jit_gpr_t reg = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(reg), u.i); + em_wp(_jit, _FMV_W_X(r0, jit_gpr_regno(reg))); + unget_temp_gpr(_jit); +} +static void +movi_d(jit_state_t *_jit, int32_t r0, jit_float64_t i0) +{ + // TODO: How to move a 64 bit value from a 32 bit X register? + // ATM only works on RV64 + union { int64_t i; jit_float64_t f; } u = { .f = i0 }; + jit_gpr_t reg = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(reg), u.i); + em_wp(_jit, _FMV_D_X(r0, jit_gpr_regno(reg))); + unget_temp_gpr(_jit); +} + + +/* + * Argument management + */ +static void +retval_f(jit_state_t *_jit, int32_t r0) +{ + movr_f(_jit, jit_fpr_regno(_FA0), r0); +} + +static void +retval_d(jit_state_t *_jit, int32_t r0) +{ + movr_d(_jit, jit_fpr_regno(_FA0), r0); +} + +static void +retr_f(jit_state_t *_jit, int32_t u) +{ + movr_f(_jit, jit_fpr_regno(_FA0), u); + ret(_jit); +} + +static void +retr_d(jit_state_t *_jit, int32_t u) +{ + movr_d(_jit, jit_fpr_regno(_FA0), u); + ret(_jit); +} + + +/* + * Branch instructions + */ + +static jit_reloc_t +bltr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_S(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bler_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_S(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static 
jit_reloc_t +beqr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_S(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bler_f(_jit, r1, r0); +} + +static jit_reloc_t +bgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr_f(_jit, r1, r0); +} + +static jit_reloc_t +bner_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_S(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunltr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_S(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunler_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_S(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_S(t0, r0, r1)); + em_wp(_jit, _FLT_S(t1, r1, r0)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunger_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_S(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + 
unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bungtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_S(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +/* Branch when the OR of _FLT_S(r1, r0) and _FLT_S(r0, r1) is non-zero. Releases both temporaries it takes. */ +static jit_reloc_t +bltgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_S(t0, r1, r0)); + em_wp(_jit, _FLT_S(t1, r0, r1)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +/* Branch when _FEQ_S(r0, r0) and _FEQ_S(r1, r1) are both set (their AND is non-zero). Releases both temporaries it takes. */ +static jit_reloc_t +bordr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_S(t0, r0, r0)); + em_wp(_jit, _FEQ_S(t1, r1, r1)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +/* Branch when the AND of _FEQ_S(r1, r1) and _FEQ_S(r0, r0) is zero. Releases both temporaries it takes. */ +static jit_reloc_t +bunordr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_S(t0, r1, r1)); + em_wp(_jit, _FEQ_S(t1, r0, r0)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bltr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_D(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bler_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_D(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + +
unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +beqr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_D(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bler_d(_jit, r1, r0); +} + +static jit_reloc_t +bgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr_d(_jit, r1, r0); +} + +static jit_reloc_t +bner_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_D(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunltr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_D(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunler_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_D(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +/* Branch when the OR of _FLT_D(r0, r1) and _FLT_D(r1, r0) is zero. Takes two temporaries, so both are released before returning. */ +static jit_reloc_t +buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_D(t0, r0, r1)); + em_wp(_jit, _FLT_D(t1, r1, r0)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunger_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_D(t0, r0, r1)); + jit_reloc_t
ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bungtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_D(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +/* Branch when the OR of _FLT_D(r1, r0) and _FLT_D(r0, r1) is non-zero. Releases both temporaries it takes. */ +static jit_reloc_t +bltgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_D(t0, r1, r0)); + em_wp(_jit, _FLT_D(t1, r0, r1)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +/* Branch when _FEQ_D(r0, r0) and _FEQ_D(r1, r1) are both set (their AND is non-zero). Releases both temporaries it takes. */ +static jit_reloc_t +bordr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_D(t0, r0, r0)); + em_wp(_jit, _FEQ_D(t1, r1, r1)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} + +/* Branch when the AND of _FEQ_D(r1, r1) and _FEQ_D(r0, r0) is zero. Releases both temporaries it takes. */ +static jit_reloc_t +bunordr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_D(t0, r1, r1)); + em_wp(_jit, _FEQ_D(t1, r0, r0)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + unget_temp_gpr(_jit); + return ret; +} diff --git a/libguile/lightening/lightening/riscv.c b/libguile/lightening/lightening/riscv.c new file mode 100644 index 000000000..d3e4efaa3 --- /dev/null +++ b/libguile/lightening/lightening/riscv.c @@ -0,0 +1,341 @@ +/* + * Copyright (C) 2021-2024 Free Software Foundation, Inc. + * + * This file is part of GNU lightning.
+ * + * GNU lightning is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU lightning is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. + * + * Authors: + * Ekaitz Zarraga + */ + +#include "riscv-cpu.c" +#include "riscv-fpu.c" + +static const jit_gpr_t abi_gpr_args[] = { + _A0, _A1, _A2, _A3, _A4, _A5, _A6, _A7 +}; +static const jit_fpr_t abi_fpr_args[] = { + _FA0, _FA1, _FA2, _FA3, _FA4, _FA5, _FA6, _FA7 +}; +static const int abi_gpr_arg_count = sizeof(abi_gpr_args) / sizeof(abi_gpr_args[0]); +static const int abi_fpr_arg_count = sizeof(abi_fpr_args) / sizeof(abi_fpr_args[0]); + +struct abi_arg_iterator +{ + const jit_operand_t *args; + size_t argc; + + size_t arg_idx; + size_t gpr_idx; + size_t fpr_idx; + uint32_t vfp_used_registers; + size_t stack_size; + size_t stack_padding; +}; + +static size_t page_size; + +jit_bool_t +jit_get_cpu(void) +{ + page_size = sysconf(_SC_PAGE_SIZE); + // FIXME check version, extensions, hardware fp support + // + // List of macro definitions for riscv support: + // ------------------------------------------- + // __riscv: defined for any RISC-V target. Older versions of the GCC + // toolchain defined __riscv__. + // + // __riscv_xlen: 32 for RV32 and 64 for RV64. + // + // __riscv_float_abi_soft, __riscv_float_abi_single, + // __riscv_float_abi_double: one of these three will be defined, depending on + // target ABI. + // + // __riscv_cmodel_medlow, __riscv_cmodel_medany: one of these two will be + // defined, depending on the target code model. + // + // __riscv_mul: defined when targeting the 'M' ISA extension. 
+ // + // __riscv_muldiv: defined when targeting the 'M' ISA extension and -mno-div + // has not been used. + // + // __riscv_div: defined when targeting the 'M' ISA extension and -mno-div has + // not been used. + // + // __riscv_atomic: defined when targeting the 'A' ISA extension. + // + // __riscv_flen: 32 when targeting the 'F' ISA extension (but not 'D') and 64 + // when targeting 'FD'. + // + // __riscv_fdiv: defined when targeting the 'F' or 'D' ISA extensions and + // -mno-fdiv has not been used. + // + // __riscv_fsqrt: defined when targeting the 'F' or 'D' ISA extensions and + // -mno-fdiv has not been used. + // + // __riscv_compressed: defined when targeting the 'C' ISA extension. + return 1; +} + +jit_bool_t +jit_init(jit_state_t *_jit) +{ + return 1; +} + +static size_t +jit_initial_frame_size (void) +{ + return 0; +} + +static void +reset_abi_arg_iterator(struct abi_arg_iterator *iter, size_t argc, + const jit_operand_t *args) +{ + memset(iter, 0, sizeof *iter); + iter->argc = argc; + iter->args = args; +} + +static void +next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg) +{ + // RISC-V Calling convention: + // https://riscv.org/wp-content/uploads/2015/01/riscv-calling.pdf + // + // The RISC-V calling convention passes arguments in registers when possible. + // Up to eight integer registers, a0–a7, and up to eight floating-point + // registers, fa0–fa7, are used for this purpose. + // + // If argument i < 8 is a floating-point type, it is passed in floating-point + // register fai; otherwise, it is passed in integer register ai. 
+ + ASSERT(iter->arg_idx < iter->argc); + enum jit_operand_abi abi = iter->args[iter->arg_idx].abi; + iter->arg_idx++; + if (is_gpr_arg(abi) && iter->gpr_idx < abi_gpr_arg_count) { + *arg = jit_operand_gpr (abi, abi_gpr_args[iter->gpr_idx++]); + return; + } + if (is_fpr_arg(abi) && iter->fpr_idx < abi_fpr_arg_count) { + *arg = jit_operand_fpr (abi, abi_fpr_args[iter->fpr_idx++]); + return; + } else if (is_fpr_arg(abi) && iter->gpr_idx < abi_gpr_arg_count) { + *arg = jit_operand_gpr (abi, abi_gpr_args[iter->gpr_idx++]); + return; + } + *arg = jit_operand_mem (abi, JIT_SP, iter->stack_size); +#if __WORDSIZE == 32 + iter->stack_size += 4 + (abi == JIT_OPERAND_ABI_DOUBLE ? 4 : 0); +#elif __WORDSIZE == 64 + iter->stack_size += 8; +#endif +} + +static void +jit_flush(void *fptr, void *tptr) +{ + jit_word_t f = (jit_word_t)fptr & -page_size; + jit_word_t t = (((jit_word_t)tptr) + page_size - 1) & -page_size; + __clear_cache((void *)f, (void *)t); +} + +static inline size_t +jit_stack_alignment(void) +{ + return 16; + // NOTE: See: https://github.com/riscv/riscv-gcc/issues/61 +} + +static void +jit_try_shorten(jit_state_t *_jit, jit_reloc_t reloc, jit_pointer_t addr) +{ +} + +static void* +bless_function_pointer(void *ptr) +{ + return ptr; +} + +static jit_gpr_t +get_callr_temp (jit_state_t * _jit) +{ + return _RA; +} + +/* + * Veneers + */ +struct veneer{ + instr_t auipc; + instr_t load; // `ld` in RV64 and `lw` in RV32 + instr_t jalr; +#if __WORDSIZE == 64 + uint32_t padding; + uint64_t address; +#elif __WORDSIZE == 32 + uint32_t address; +#endif +}; + +static void +emit_veneer(jit_state_t *_jit, jit_pointer_t target) +{ + // We need to generate something like this: + // ---------------------------------------------- + // 32 bits: | 64 bits: + // auipc t0, 0 | auipc t0, 0 + // lw t0, 12(t0) | ld t0, 16(t0) + // jalr zero, 0(t0) | jalr zero, 0(t0) + // ADDRESS_LITERAL | .byte 0x00, 0x00, 0x00, 0x00 (padding) + // | ADDRESS_LITERAL + // + jit_gpr_t t0 = get_temp_gpr(_jit);
+ emit_u32(_jit, _AUIPC(jit_gpr_regno(t0), 0)); +#if __WORDSIZE == 64 + emit_u32(_jit, _LD(jit_gpr_regno(t0), jit_gpr_regno(t0), 16)); +#elif __WORDSIZE == 32 + emit_u32(_jit, _LW(jit_gpr_regno(t0), jit_gpr_regno(t0), 12)); +#endif + emit_u32(_jit, _JALR(jit_gpr_regno(_ZERO), jit_gpr_regno(t0), 0)); +#if __WORDSIZE == 64 + emit_u32(_jit, 0); // Padding + emit_u64(_jit, (uint64_t) target); +#elif __WORDSIZE == 32 + emit_u32(_jit, (uint32_t) target); +#endif + unget_temp_gpr(_jit); +} + +static void +patch_veneer(uint32_t *loc, jit_pointer_t addr) +{ + struct veneer *v = (struct veneer*) loc; +#if __WORDSIZE == 64 + v->address = (uint64_t) addr; +#elif __WORDSIZE == 32 + v->address = (uint32_t) addr; +#endif +} + + +/* + * Conditional jumps + */ +static void +patch_jcc_offset(uint32_t *loc, ptrdiff_t v) +{ + instr_t *i = (instr_t *) loc; + i->w = patch_cc_jump(i->w, v); +} +static void +patch_veneer_jcc_offset(uint32_t *loc, ptrdiff_t offset){ + patch_jcc_offset(loc, offset); +} + +/* Rebuild the signed offset held in the B-format immediate fields of *loc. imm12 is the sign bit: it is moved to bit 31 and shifted back by 19 so that it fills bits 31..12 only, leaving bit 11 to imm11. */ +static int32_t +read_jcc_offset(uint32_t *loc) +{ + instr_t i; + i.w = *loc; + + int32_t offset = i.B.imm12 << 31; + offset >>= 19; + offset |= (i.B.imm11 << 11); + offset |= (i.B.imm10_5 << 5); + offset |= (i.B.imm4_1 << 1); + + return offset; +} +static int +offset_in_jcc_range(ptrdiff_t offset, int flags) +{ + if(offset & 1) + return 0; + else + return simm12_p(offset >> 1); +} + +/* + * Unconditional jumps + */ +/* Rebuild the signed offset held in the J-format immediate fields of *loc. imm20 is the sign bit: it is moved to bit 31 and shifted back by 11 so that it fills bits 31..20 only, leaving bits 19..12 to imm19_12. */ +static int32_t read_jmp_offset(uint32_t *loc) +{ + instr_t i; + i.w = *loc; + + int32_t offset = i.J.imm20 << 31; + offset >>= 11; + offset |= (i.J.imm19_12 << 12); + offset |= (i.J.imm11 << 11); + offset |= (i.J.imm10_1 << 1); + return offset; +} +static int +offset_in_jmp_range(ptrdiff_t offset, int flags) +{ + if(offset & 1) + return 0; + else + return simm20_p(offset >> 1); +} + +static void +patch_jmp_offset(uint32_t *loc, ptrdiff_t v) +{ + instr_t *i = (instr_t *) loc; + i->w = patch_jump(i->w, v); +} + +static void +patch_veneer_jmp_offset(uint32_t *loc, ptrdiff_t
offset) +{ + patch_jmp_offset(loc, offset); +} + + +/* + * Jumps around the veneer + */ +static void +patch_jmp_without_veneer(jit_state_t *_jit, uint32_t *loc) +{ + patch_jmp_offset(loc, _jit->pc.uw - (uintptr_t)loc); +} +static uint32_t* +jmp_without_veneer(jit_state_t *_jit) +{ + uint32_t *loc = _jit->pc.ui; + emit_u32(_jit, _JAL(jit_gpr_regno(_ZERO), 0)); + return loc; +} + + +/* + * Load from pool offset + */ +static void +patch_load_from_pool_offset(uint32_t *loc, int32_t v) +{ + load_from_pool_t *i = (load_from_pool_t *) loc; + i->l = patch_load_from_pool(i->l, v); +} +static int32_t +read_load_from_pool_offset(uint32_t *loc) +{ + load_from_pool_t *i = (load_from_pool_t*) loc; + /* NOTE(review): the field named imm31_12 is added here without a << 12, and imm11_0 is added without explicit sign extension. Presumably this only matches the encoded offset while the upper immediate is zero -- confirm against patch_load_from_pool() and the instr_t bit-field declarations. */ + return i->inst.auipc.U.imm31_12 + i->inst.load.I.imm11_0; +} + diff --git a/libguile/lightening/lightening/riscv.h b/libguile/lightening/lightening/riscv.h new file mode 100644 index 000000000..173216655 --- /dev/null +++ b/libguile/lightening/lightening/riscv.h @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2021-2024 Free Software Foundation, Inc. + * + * This file is part of GNU lightning. + * + * GNU lightning is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU lightning is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details.
+ * + * Authors: + * Ekaitz Zarraga + */ + +#ifndef _jit_riscv_h +#define _jit_riscv_h + +#define JIT_NEEDS_LITERAL_POOL 1 + +// x registers +// Special registers +#define _RA JIT_GPR(1) // Return address +#define _SP JIT_GPR(2) // Stack pointer +#define _GP JIT_GPR(3) // Global pointer +#define _TP JIT_GPR(4) // Thread pointer +#define _FP JIT_GPR(8) // Frame pointer +#define _ZERO JIT_GPR(0) // Always zero +// Argument passing +#define _A0 JIT_GPR(10) +#define _A1 JIT_GPR(11) +#define _A2 JIT_GPR(12) +#define _A3 JIT_GPR(13) +#define _A4 JIT_GPR(14) +#define _A5 JIT_GPR(15) +#define _A6 JIT_GPR(16) +#define _A7 JIT_GPR(17) +// Saved registers +#define _S0 _FP // S0 is the frame pointer normally +#define _S1 JIT_GPR(9) +#define _S2 JIT_GPR(18) +#define _S3 JIT_GPR(19) +#define _S4 JIT_GPR(20) +#define _S5 JIT_GPR(21) +#define _S6 JIT_GPR(22) +#define _S7 JIT_GPR(23) +#define _S8 JIT_GPR(24) +#define _S9 JIT_GPR(25) +#define _S10 JIT_GPR(26) +#define _S11 JIT_GPR(27) +// Temporaries +#define _T0 JIT_GPR(5) +#define _T1 JIT_GPR(6) +#define _T2 JIT_GPR(7) +#define _T3 JIT_GPR(28) +#define _T4 JIT_GPR(29) +#define _T5 JIT_GPR(30) +#define _T6 JIT_GPR(31) + +// f registers +// Temporaries +#define _FT0 JIT_FPR(0) +#define _FT1 JIT_FPR(1) +#define _FT2 JIT_FPR(2) +#define _FT3 JIT_FPR(3) +#define _FT4 JIT_FPR(4) +#define _FT5 JIT_FPR(5) +#define _FT6 JIT_FPR(6) +#define _FT7 JIT_FPR(7) +#define _FT8 JIT_FPR(28) +#define _FT9 JIT_FPR(29) +#define _FT10 JIT_FPR(30) +#define _FT11 JIT_FPR(31) +// Saved registers +#define _FS0 JIT_FPR(8) +#define _FS1 JIT_FPR(9) +#define _FS2 JIT_FPR(18) +#define _FS3 JIT_FPR(19) +#define _FS4 JIT_FPR(20) +#define _FS5 JIT_FPR(21) +#define _FS6 JIT_FPR(22) +#define _FS7 JIT_FPR(23) +#define _FS8 JIT_FPR(24) +#define _FS9 JIT_FPR(25) +#define _FS10 JIT_FPR(26) +#define _FS11 JIT_FPR(27) +// Argument passing +#define _FA0 JIT_FPR(10) +#define _FA1 JIT_FPR(11) +#define _FA2 JIT_FPR(12) +#define _FA3 JIT_FPR(13) +#define _FA4 JIT_FPR(14)
+#define _FA5 JIT_FPR(15) +#define _FA6 JIT_FPR(16) +#define _FA7 JIT_FPR(17) + + +// JIT Registers +// ---------------------------------------------------------------------- +// Caller-save registers JIT_R${NUM} +// Callee-save registers JIT_V${NUM} +// Caller-save temporary registers JIT_TMP${NUM} +// Caller-save floating point registers JIT_F${NUM} +// Callee-save floating point registers JIT_VF${NUM} +// Caller-save floating point temporary registers JIT_FTMP${NUM} + +// Caller-save registers +#define JIT_R0 _A0 +#define JIT_R1 _A1 +#define JIT_R2 _A2 +#define JIT_R3 _A3 +#define JIT_R4 _A4 +#define JIT_R5 _A5 +#define JIT_R6 _A6 +#define JIT_R7 _A7 + +// Use this as a CARRY +#define JIT_CARRY _T0 +#define JIT_TMP0 _T1 +#define JIT_TMP1 _T2 +#define JIT_TMP2 _T3 + +#define JIT_TMP3 _T4 +// Temporaries +#define JIT_TMP4 _T5 +#define JIT_TMP5 _T6 + +// Callee-save registers +#define JIT_V0 _S1 +#define JIT_V1 _S2 +#define JIT_V2 _S3 +#define JIT_V3 _S4 +#define JIT_V4 _S5 +#define JIT_V5 _S6 +#define JIT_V6 _S7 +#define JIT_V7 _S8 +#define JIT_V8 _S9 +#define JIT_V9 _S10 +#define JIT_V10 _S11 + + +// Callee-save floating point registers +#define JIT_VF0 _FS0 +#define JIT_VF1 _FS1 +#define JIT_VF2 _FS2 +#define JIT_VF3 _FS3 +#define JIT_VF4 _FS4 +#define JIT_VF5 _FS5 +#define JIT_VF6 _FS6 +#define JIT_VF7 _FS7 +#define JIT_VF8 _FS8 +#define JIT_VF9 _FS9 +#define JIT_VF10 _FS10 +#define JIT_VF11 _FS11 + +// Caller save floating point registers +#define JIT_F0 _FA0 +#define JIT_F1 _FA1 +#define JIT_F2 _FA2 +#define JIT_F3 _FA3 +#define JIT_F4 _FA4 +#define JIT_F5 _FA5 +#define JIT_F6 _FA6 +#define JIT_F7 _FA7 +// NOTE: These are temporaries, but we can use them as general purpose +// registers as there's only one temporary JIT_FTMP supported by lightening.c +#define JIT_F8 _FT0 +#define JIT_F9 _FT1 +#define JIT_F10 _FT2 +#define JIT_F11 _FT3 +#define JIT_F12 _FT4 +#define JIT_F13 _FT5 +#define JIT_F14 _FT6 +#define JIT_F15 _FT7 +#define JIT_F16 _FT8 +#define JIT_F17 
_FT9 +#define JIT_F18 _FT10 + +// Floating point temporary register +#define JIT_FTMP _FT11 + +// Special purpose registers +#define JIT_FP _FP +#define JIT_LR _RA +#define JIT_SP _SP + +// TODO: Make sure this is correct +#define JIT_PLATFORM_CALLEE_SAVE_GPRS JIT_LR + +#endif diff --git a/libguile/lightening/lightening/x86-sse.c b/libguile/lightening/lightening/x86-sse.c index ab66dc7c5..0331ff056 100644 --- a/libguile/lightening/lightening/x86-sse.c +++ b/libguile/lightening/lightening/x86-sse.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2017, 2019 Free Software Foundation, Inc. + * Copyright (C) 2012-2017, 2019, 2025 Free Software Foundation, Inc. * * This file is part of GNU lightning. * @@ -128,13 +128,24 @@ movdlxr(jit_state_t *_jit, int32_t r0, int32_t r1) { ssexr(_jit, 0x66, X86_SSE_X2G, r0, r1); } +static void +movdlrx(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + ssexr(_jit, 0x66, X86_SSE_G2X, r0, r1); +} static void movdqxr(jit_state_t *_jit, int32_t r0, int32_t r1) maybe_unused; +static void movdqrx(jit_state_t *_jit, int32_t r0, int32_t r1) maybe_unused; static void movdqxr(jit_state_t *_jit, int32_t r0, int32_t r1) { sselxr(_jit, 0x66, X86_SSE_X2G, r0, r1); } +static void +movdqrx(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + sselxr(_jit, 0x66, X86_SSE_G2X, r0, r1); +} static void movssmr(jit_state_t *_jit, int32_t md, int32_t rb, int32_t ri, int32_t ms, int32_t rd) @@ -171,6 +182,29 @@ movr_d(jit_state_t *_jit, int32_t r0, int32_t r1) ssexr(_jit, 0xf2, X86_SSE_MOV, r0, r1); } +static void +movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + movdlrx(_jit, r0, r1); +} +static void +movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + movdlxr(_jit, r0, r1); +} +#if __X64 +static void +movr_l_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + movdqrx(_jit, r0, r1); +} +static void +movr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + movdqxr(_jit, r0, r1); +} +#endif + static void addssr(jit_state_t *_jit, int32_t r0, int32_t r1) { diff 
--git a/libguile/lightening/lightening/x86.c b/libguile/lightening/lightening/x86.c index f8ac4b0b8..873cb27a4 100644 --- a/libguile/lightening/lightening/x86.c +++ b/libguile/lightening/lightening/x86.c @@ -405,3 +405,9 @@ bless_function_pointer(void *ptr) { return ptr; } + +static jit_gpr_t +get_callr_temp (jit_state_t * _jit) +{ + return _RAX; +} diff --git a/libguile/lightening/tests/Makefile b/libguile/lightening/tests/Makefile index 769b43423..271f3e403 100644 --- a/libguile/lightening/tests/Makefile +++ b/libguile/lightening/tests/Makefile @@ -1,5 +1,5 @@ -TESTS=$(sort $(basename $(wildcard *.c))) -TARGETS ?= native ia32 aarch64 armv7 +TESTS ?= $(sort $(basename $(wildcard *.c))) +TARGETS ?= native ia32 aarch64 armv7 riscv # Suitable values of cross-compiler variables for Debian: # @@ -14,13 +14,15 @@ TARGETS ?= native ia32 aarch64 armv7 # libc6-dev:amd64 gcc make \ # qemu binfmt-support qemu-user-static \ # gcc-i686-linux-gnu libc6-dev-i386-cross libc6:i386 \ -# gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64 +# gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64\ +# gcc-riscv64-linux-gnu libc6-dev-riscv64-cross libc6:riscv64 # CC = gcc -CC_IA32=guix environment --pure -s i686-linux --ad-hoc gcc-toolchain -- gcc -CC_AARCH64=guix environment --pure -s aarch64-linux --ad-hoc gcc-toolchain -- gcc -CC_ARMv7=guix environment --pure -s armhf-linux --ad-hoc gcc-toolchain -- gcc -CFLAGS = -Wall -O0 -g +CC_IA32 ?= guix environment --pure -s i686-linux --ad-hoc gcc-toolchain -- gcc +CC_AARCH64 ?= guix environment --pure -s aarch64-linux --ad-hoc gcc-toolchain -- gcc +CC_ARMv7 ?= guix environment --pure -s armhf-linux --ad-hoc gcc-toolchain -- gcc +CC_RISCV ?= guix environment --pure -s riscv64-linux --ad-hoc gcc-toolchain -- gcc +CFLAGS ?= -Wall -O0 -g all: $(foreach TARGET,$(TARGETS),$(addprefix test-$(TARGET)-,$(TESTS))) @@ -54,6 +56,10 @@ test-armv7-%: CC = $(CC_ARMv7) test-armv7-%: %.c lightening-armv7.o test.h $(CC) $(CFLAGS) $(CPPFLAGS) -I.. 
-o $@ lightening-armv7.o $< +test-riscv-%: CC = $(CC_RISCV) +test-riscv-%: %.c lightening-riscv.o test.h + $(CC) $(CFLAGS) $(CPPFLAGS) -I.. -o $@ lightening-riscv.o $< + .PRECIOUS: $(foreach TARGET,$(TARGETS),$(addprefix test-$(TARGET)-,$(TESTS))) .PRECIOUS: $(foreach TARGET,$(TARGETS),lightening-$(TARGET).o) diff --git a/libguile/lightening/tests/call_10_2.c b/libguile/lightening/tests/call_10_2.c new file mode 100644 index 000000000..189757876 --- /dev/null +++ b/libguile/lightening/tests/call_10_2.c @@ -0,0 +1,165 @@ +#include "test.h" +#include "regarrays.inc" + +#define DEFINE_TEST_INT(ABI_TYPE, TYPE, LIT, NEGATE) \ +static TYPE \ +check_##TYPE (TYPE a, TYPE b, TYPE c, TYPE d, TYPE e, \ + TYPE f, TYPE g, TYPE h, TYPE i, TYPE j) \ +{ \ + ASSERT(a == LIT(0)); \ + ASSERT(b == NEGATE(1)); \ + ASSERT(c == LIT(2)); \ + ASSERT(d == NEGATE(3)); \ + ASSERT(e == LIT(4)); \ + ASSERT(f == NEGATE(5)); \ + ASSERT(g == LIT(6)); \ + ASSERT(h == NEGATE(7)); \ + ASSERT(i == LIT(8)); \ + ASSERT(j == NEGATE(9)); \ + return LIT(42); \ +} \ + \ +static void \ +run_test_##TYPE (jit_state_t *j, uint8_t *arena_base, size_t arena_size, \ + jit_gpr_t base) \ +{ \ + jit_begin(j, arena_base, arena_size); \ + size_t align = jit_enter_jit_abi(j, v_count, 0, 0); \ + jit_load_args_1(j, jit_operand_gpr (JIT_OPERAND_ABI_POINTER, base)); \ + \ + jit_operand_t args[10] = { \ + jit_operand_mem(ABI_TYPE, base, 0 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 1 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 2 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 3 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 4 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 5 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 6 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 7 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 8 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 9 * sizeof(TYPE)), \ + }; \ + jit_calli(j, check_##TYPE, 10, args); \ + jit_leave_jit_abi(j, v_count, 0, 
align); \ + jit_ret(j); \ + \ + size_t size = 0; \ + void* ret = jit_end(j, &size); \ + \ + TYPE (*f)(TYPE*) = ret; \ + \ + TYPE iargs[10] = { LIT(0), NEGATE(1), LIT(2), NEGATE(3), LIT(4), \ + NEGATE(5), LIT(6), NEGATE(7), LIT(8), NEGATE(9) }; \ + ASSERT(f(iargs) == LIT(42)); \ +} + +#define LIT(X) (X) +#define NEGATE(X) (-X) +DEFINE_TEST_INT(JIT_OPERAND_ABI_INT32, int32_t, LIT, NEGATE); +#if (UINTPTR_MAX == UINT64_MAX) +DEFINE_TEST_INT(JIT_OPERAND_ABI_INT64, int64_t, LIT, NEGATE); +#endif +#undef NEGATE + +#define NEGATE(X) (~X) +DEFINE_TEST_INT(JIT_OPERAND_ABI_UINT32, uint32_t, LIT, NEGATE); +#if (UINTPTR_MAX == UINT64_MAX) +DEFINE_TEST_INT(JIT_OPERAND_ABI_UINT64, uint64_t, LIT, NEGATE); +#endif +#undef NEGATE +#undef LIT + +typedef uint8_t* ptr_t; +#define LIT(X) ((ptr_t)(uintptr_t)(X)) +#define NEGATE(X) ((ptr_t)(~(uintptr_t)(X))) +DEFINE_TEST_INT(JIT_OPERAND_ABI_POINTER, ptr_t, LIT, NEGATE); + +static double +check_double (double a, double b, double c, double d, double e, + double f, double g, double h, double i, double j) +{ + ASSERT(a == 0.0); + ASSERT(b == -1.0); + ASSERT(c == -0xfffffffffffffp+100l); + ASSERT(d == +0xfffffffffffffp-100l); + ASSERT(e == -0xfffffffffffffp+101l); + ASSERT(f == +0xfffffffffffffp-102l); + ASSERT(g == -0xfffffffffffffp+102l); + ASSERT(h == +0xfffffffffffffp-103l); + ASSERT(i == -0xfffffffffffffp+103l); + ASSERT(j == +0xfffffffffffffp-104l); + return 42; +} + +static void +run_test_double (jit_state_t *j, uint8_t *arena_base, size_t arena_size, + jit_gpr_t base) +{ + double dargs[10] = { + 0.0, + -1.0, + -0xfffffffffffffp+100l, + +0xfffffffffffffp-100l, + -0xfffffffffffffp+101l, + +0xfffffffffffffp-102l, + -0xfffffffffffffp+102l, + +0xfffffffffffffp-103l, + -0xfffffffffffffp+103l, + +0xfffffffffffffp-104l, + }; + jit_begin(j, arena_base, arena_size); + size_t align = jit_enter_jit_abi(j, v_count, 0, 0); + jit_load_args_1(j, jit_operand_gpr (JIT_OPERAND_ABI_POINTER, base)); + enum jit_operand_abi abi = JIT_OPERAND_ABI_DOUBLE; + 
jit_movi_d(j, JIT_F0, dargs[0]); + jit_movi_d(j, JIT_F1, dargs[1]); + jit_movi_d(j, JIT_F2, dargs[2]); + jit_movi_d(j, JIT_F3, dargs[3]); + jit_movi_d(j, JIT_F4, dargs[4]); + jit_movi_d(j, JIT_F5, dargs[5]); + jit_movi_d(j, JIT_F6, dargs[6]); + jit_operand_t args[10] = { + jit_operand_fpr(abi, JIT_F0), + jit_operand_fpr(abi, JIT_F1), + jit_operand_fpr(abi, JIT_F2), + jit_operand_fpr(abi, JIT_F3), + jit_operand_fpr(abi, JIT_F4), + jit_operand_fpr(abi, JIT_F5), + jit_operand_fpr(abi, JIT_F6), + jit_operand_mem(abi, base, 7 * sizeof(double)), + jit_operand_mem(abi, base, 8 * sizeof(double)), + jit_operand_mem(abi, base, 9 * sizeof(double)), + }; + jit_calli(j, check_double, 10, args); + jit_leave_jit_abi(j, v_count, 0, align); + jit_ret(j); + + size_t size = 0; + void* ret = jit_end(j, &size); + + double (*f)(double*) = ret; + + ASSERT(f(dargs) == 42); +} + +static void +run_test (jit_state_t * j, uint8_t * arena_base, size_t arena_size) +{ + for (unsigned i = 0; i < gpr_count; i++) + { + run_test_int32_t (j, arena_base, arena_size, gpr_ref (i)); + run_test_uint32_t (j, arena_base, arena_size, gpr_ref (i)); +#if (UINTPTR_MAX == UINT64_MAX) + run_test_int64_t (j, arena_base, arena_size, gpr_ref (i)); + run_test_uint64_t (j, arena_base, arena_size, gpr_ref (i)); +#endif + run_test_ptr_t (j, arena_base, arena_size, gpr_ref (i)); + run_test_double (j, arena_base, arena_size, gpr_ref (i)); + } +} + +int +main (int argc, char *argv[]) +{ + return main_helper(argc, argv, run_test); +} diff --git a/libguile/lightening/tests/callr_10.c b/libguile/lightening/tests/callr_10.c new file mode 100644 index 000000000..bca488c75 --- /dev/null +++ b/libguile/lightening/tests/callr_10.c @@ -0,0 +1,66 @@ +#include "test.h" +#include "regarrays.inc" + +static int32_t f(int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, + int32_t f, int32_t g, int32_t h, int32_t i, int32_t j) { + ASSERT(a == 0); + ASSERT(b == 1); + ASSERT(c == 2); + ASSERT(d == 3); + ASSERT(e == 4); + ASSERT(f == 5); + 
ASSERT(g == 6); + ASSERT(h == 7); + ASSERT(i == 8); + ASSERT(j == 9); + return 42; +} + +static void +run_test_2 (jit_state_t *j, uint8_t *arena_base, size_t arena_size, + jit_gpr_t base, jit_gpr_t fun) +{ + jit_begin(j, arena_base, arena_size); + size_t align = jit_enter_jit_abi(j, v_count, 0, 0); + jit_load_args_1(j, jit_operand_gpr (JIT_OPERAND_ABI_POINTER, base)); + + jit_operand_t args[10] = { + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 0 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 1 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 2 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 3 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 4 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 5 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 6 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 7 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 8 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 9 * sizeof(int32_t)) + }; + jit_movi(j, fun, (uintptr_t)f); + jit_callr(j, fun, 10, args); + jit_leave_jit_abi(j, v_count, 0, align); + jit_ret(j); + + size_t size = 0; + void* ret = jit_end(j, &size); + + int32_t (*f)(int32_t*) = ret; + + int32_t iargs[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + ASSERT(f(iargs) == 42); +} + +static void +run_test (jit_state_t *jit, uint8_t *arena_base, size_t arena_size) +{ + for (unsigned i = 0; i < gpr_count; i++) + for (unsigned j = 0; j < gpr_count; j++) + if (i != j) + run_test_2 (jit, arena_base, arena_size, gpr_ref(i), gpr_ref(j)); +} + +int +main (int argc, char *argv[]) +{ + return main_helper(argc, argv, run_test); +} diff --git a/libguile/lightening/tests/movr_dl.c b/libguile/lightening/tests/movr_dl.c new file mode 100644 index 000000000..029d41bcc --- /dev/null +++ b/libguile/lightening/tests/movr_dl.c @@ -0,0 +1,26 @@ +#include "test.h" + +static void 
+run_test(jit_state_t *j, uint8_t *arena_base, size_t arena_size) +{ +#if __WORDSIZE > 32 + jit_begin(j, arena_base, arena_size); + size_t align = jit_enter_jit_abi(j, 0, 0, 0); + + jit_movi_d(j, JIT_F0, 3.14159); + jit_movr_l_d(j, JIT_R0, JIT_F0); + jit_movr_d_l(j, JIT_F1, JIT_R0); + jit_leave_jit_abi(j, 0, 0, align); + jit_retr_d(j, JIT_F1); + + double (*f)(void) = jit_end(j, NULL); + + ASSERT(f() == 3.14159); +#endif +} + +int +main (int argc, char *argv[]) +{ + return main_helper(argc, argv, run_test); +} diff --git a/libguile/lightening/tests/movr_fi.c b/libguile/lightening/tests/movr_fi.c new file mode 100644 index 000000000..f8d3bdf4c --- /dev/null +++ b/libguile/lightening/tests/movr_fi.c @@ -0,0 +1,24 @@ +#include "test.h" + +static void +run_test(jit_state_t *j, uint8_t *arena_base, size_t arena_size) +{ + jit_begin(j, arena_base, arena_size); + size_t align = jit_enter_jit_abi(j, 0, 0, 0); + + jit_movi_f(j, JIT_F0, 3.14159); + jit_movr_i_f(j, JIT_R0, JIT_F0); + jit_movr_f_i(j, JIT_F1, JIT_R0); + jit_leave_jit_abi(j, 0, 0, align); + jit_retr_f(j, JIT_F1); + + float (*f)(void) = jit_end(j, NULL); + + ASSERT(f() == 3.14159f); +} + +int +main (int argc, char *argv[]) +{ + return main_helper(argc, argv, run_test); +} diff --git a/libguile/lightening/tests/regarrays.inc b/libguile/lightening/tests/regarrays.inc new file mode 100644 index 000000000..de56c905c --- /dev/null +++ b/libguile/lightening/tests/regarrays.inc @@ -0,0 +1,206 @@ +/* Arrays describing the available user registers. 
-*- mode: c -*- */ + +// #ifdef orgy factored out to common include file +/* R registers: R0-R2 unconditionally, R3-R16 only when the backend defines them. */ +static const jit_gpr_t rregs[] = { + JIT_R0, + JIT_R1, + JIT_R2, +#ifdef JIT_R3 + JIT_R3, +#endif +#ifdef JIT_R4 + JIT_R4, +#endif +#ifdef JIT_R5 + JIT_R5, +#endif +#ifdef JIT_R6 + JIT_R6, +#endif +#ifdef JIT_R7 + JIT_R7, +#endif +#ifdef JIT_R8 + JIT_R8, +#endif +#ifdef JIT_R9 + JIT_R9, +#endif +#ifdef JIT_R10 + JIT_R10, +#endif +#ifdef JIT_R11 + JIT_R11, +#endif +#ifdef JIT_R12 + JIT_R12, +#endif +#ifdef JIT_R13 + JIT_R13, +#endif +#ifdef JIT_R14 + JIT_R14, +#endif +#ifdef JIT_R15 + JIT_R15, +#endif +#ifdef JIT_R16 + JIT_R16, +#endif +}; +/* V registers: V0-V2 unconditionally, V3-V16 only when the backend defines them. */ +static const jit_gpr_t vregs[] = { + JIT_V0, JIT_V1, JIT_V2, +#ifdef JIT_V3 + JIT_V3, +#endif +#ifdef JIT_V4 + JIT_V4, +#endif +#ifdef JIT_V5 + JIT_V5, +#endif +#ifdef JIT_V6 + JIT_V6, +#endif +#ifdef JIT_V7 + JIT_V7, +#endif +#ifdef JIT_V8 + JIT_V8, +#endif +#ifdef JIT_V9 + JIT_V9, +#endif +#ifdef JIT_V10 + JIT_V10, +#endif +#ifdef JIT_V11 + JIT_V11, +#endif +#ifdef JIT_V12 + JIT_V12, +#endif +#ifdef JIT_V13 + JIT_V13, +#endif +#ifdef JIT_V14 + JIT_V14, +#endif +#ifdef JIT_V15 + JIT_V15, +#endif +#ifdef JIT_V16 + JIT_V16, +#endif +}; +/* F registers, each listed exactly once: F0-F5 unconditionally (still six unconditional entries, so f_count cannot shrink), F6-F16 only when the backend defines them. */ +static const jit_fpr_t fregs[] = { + JIT_F0, JIT_F1, JIT_F2, + JIT_F3, JIT_F4, JIT_F5, +#ifdef JIT_F6 + JIT_F6, +#endif +#ifdef JIT_F7 + JIT_F7, +#endif +#ifdef JIT_F8 + JIT_F8, +#endif +#ifdef JIT_F9 + JIT_F9, +#endif +#ifdef JIT_F10 + JIT_F10, +#endif +#ifdef JIT_F11 + JIT_F11, +#endif +#ifdef JIT_F12 + JIT_F12, +#endif +#ifdef JIT_F13 + JIT_F13, +#endif +#ifdef JIT_F14 + JIT_F14, +#endif +#ifdef JIT_F15 + JIT_F15, +#endif +#ifdef JIT_F16 + JIT_F16, +#endif +}; +/* VF registers, each listed exactly once; every entry is optional. */ +static const jit_fpr_t vfregs[] = { +#ifdef JIT_VF0 + JIT_VF0, +#endif +#ifdef JIT_VF1 + JIT_VF1, +#endif +#ifdef JIT_VF2 + JIT_VF2, +#endif +#ifdef JIT_VF3 + JIT_VF3, +#endif +#ifdef JIT_VF4 + JIT_VF4, +#endif +#ifdef JIT_VF5 + JIT_VF5, +#endif +#ifdef JIT_VF6 + JIT_VF6, +#endif +#ifdef JIT_VF7 + JIT_VF7, +#endif +#ifdef JIT_VF8 + JIT_VF8, +#endif +#ifdef 
JIT_VF9 + JIT_VF9, +#endif +#ifdef JIT_VF10 + JIT_VF10, +#endif +#ifdef JIT_VF11 + JIT_VF11, +#endif +#ifdef JIT_VF12 + JIT_VF12, +#endif +#ifdef JIT_VF13 + JIT_VF13, +#endif +#ifdef JIT_VF14 + JIT_VF14, +#endif +#ifdef JIT_VF15 + JIT_VF15, +#endif +#ifdef JIT_VF16 + JIT_VF16, +#endif +}; +/* Element counts of the tables above. gpr_count is spelled with ARRAY_SIZE because a const-qualified object is not a constant expression in ISO C and so cannot initialise another static object portably. */ +#define ARRAY_SIZE(X) (sizeof (X)/sizeof ((X)[0])) +static const size_t r_count = ARRAY_SIZE (rregs); +static const size_t v_count = ARRAY_SIZE (vregs); +static const size_t f_count = ARRAY_SIZE (fregs); +static const size_t vf_count = ARRAY_SIZE (vfregs); +static const size_t gpr_count = ARRAY_SIZE (rregs) + ARRAY_SIZE (vregs); +/* Map index I to a general-purpose register: indices below r_count select rregs[I], the next v_count indices select vregs[I - r_count]; any other index aborts. */ +static jit_gpr_t +gpr_ref (uintptr_t i) +{ + if (i < r_count) + return rregs[i]; + if (i < r_count + v_count) + return vregs[i - r_count]; + abort (); +}