From f31fb0044d6a7275c9a4bf3300b111a902f787f7 Mon Sep 17 00:00:00 2001 From: Helmut Eller Date: Wed, 8 Jun 2022 16:20:42 +0200 Subject: [PATCH 01/23] Fix some problems with callr and calli. The problem with callr is that the register that contains the function to be called can be overwritten by the logic that moves the values into argument registers. To fix this, I added a get_callr_temp function that should return a platform-specific register that is not used to pass arguments. For Aarch64/Arm the link register seems to work; for Amd64/i686 the RAX register. The function/tmp pair becomes an additional argument to the parallel assignment; this way the original function register is not accidentally overwritten. The problem with calli is that it may not have enough temp registers to move arguments. The windmill paper says that at most one temporary register is needed for the parallel assignment. However, we also need a temp register for mem-to-mem moves. So it seems that we need a second temporary. For Amd64/i686 we have only one temporary GPR and one temporary FPR. To fix this, I modified the algorithm from the paper a bit: we perform the mem-to-mem moves before the other moves. Later when we need the temp to break cycles, there shouldn't be any mem-to-mem moves left. So we should never need two temps at the same time. * lightening/lightening.c: (get_callr_temp): New function; needed for each platform. (prepare_call_args): Include the function/callr_temp pair in the arguments for the parallel assignment. * lightening/x86.c, lightening/arm.c, lightening/aarch64.c (get_callr_temp): Implementation for each platform. * lightening/arm.c (next_abi_arg): Fix the stack size for doubles. * tests/call_10_2.c, tests/callr_10.c: New tests. * tests/regarrays.inc: New file. Common code between the above two tests that would be tedious to duplicate. 
--- lightening/aarch64.c | 6 ++ lightening/arm.c | 8 +- lightening/lightening.c | 35 +++++-- lightening/x86.c | 6 ++ tests/call_10_2.c | 165 ++++++++++++++++++++++++++++++++ tests/callr_10.c | 66 +++++++++++++ tests/regarrays.inc | 206 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 485 insertions(+), 7 deletions(-) create mode 100644 tests/call_10_2.c create mode 100644 tests/callr_10.c create mode 100644 tests/regarrays.inc diff --git a/lightening/aarch64.c b/lightening/aarch64.c index e67365f23..fb14f3d8f 100644 --- a/lightening/aarch64.c +++ b/lightening/aarch64.c @@ -232,3 +232,9 @@ bless_function_pointer(void *ptr) { return ptr; } + +static jit_gpr_t +get_callr_temp (jit_state_t * _jit) +{ + return _LR; +} diff --git a/lightening/arm.c b/lightening/arm.c index d587e7158..11deedd89 100644 --- a/lightening/arm.c +++ b/lightening/arm.c @@ -109,7 +109,7 @@ next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg) } } *arg = jit_operand_mem (abi, JIT_SP, iter->stack_size); - iter->stack_size += 4; + iter->stack_size += 4 + (abi == JIT_OPERAND_ABI_DOUBLE ? 4 : 0); } static void @@ -137,3 +137,9 @@ bless_function_pointer(void *ptr) // Set low bit to mark as thumb mode. 
return (void*) (((uintptr_t)ptr) | 1); } + +static jit_gpr_t +get_callr_temp (jit_state_t * _jit) +{ + return _LR; +} diff --git a/lightening/lightening.c b/lightening/lightening.c index 1254514ae..afc6fd493 100644 --- a/lightening/lightening.c +++ b/lightening/lightening.c @@ -124,6 +124,8 @@ static void reset_abi_arg_iterator(struct abi_arg_iterator *iter, size_t argc, static void next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg); +static jit_gpr_t get_callr_temp (jit_state_t * _jit); + jit_bool_t init_jit(void) { @@ -1096,6 +1098,15 @@ jit_move_operands(jit_state_t *_jit, jit_operand_t *dst, jit_operand_t *src, enum move_status status[argc]; for (size_t i = 0; i < argc; i++) status[i] = TO_MOVE; + + // Mem-to-mem moves require a temp register but don't overwrite + // other argument registers. Perform them first to free up the tmp + // for other uses. + for (size_t i = 0; i < argc; i++) + if ((status[i] == TO_MOVE) + && (MOVE_KIND (src[i].kind, dst[i].kind) == MOVE_MEM_TO_MEM)) + move_one(_jit, dst, src, argc, status, i); + for (size_t i = 0; i < argc; i++) if (status[i] == TO_MOVE) move_one(_jit, dst, src, argc, status, i); @@ -1236,11 +1247,23 @@ jit_leave_jit_abi(jit_state_t *_jit, size_t v, size_t vf, size_t frame_size) // Precondition: stack is already aligned. static size_t -prepare_call_args(jit_state_t *_jit, size_t argc, jit_operand_t args[]) +prepare_call_args(jit_state_t *_jit, size_t argc, jit_operand_t args[], + jit_gpr_t *fun) { - jit_operand_t dst[argc]; + size_t count = argc + (fun == NULL ? 0 : 1); + jit_operand_t src[count]; + jit_operand_t dst[count]; + + memcpy (src, args, sizeof (jit_operand_t) * argc); + if (fun != NULL) { + jit_gpr_t fun_tmp = argc == 0 ? 
*fun : get_callr_temp (_jit); + src[argc] = jit_operand_gpr (JIT_OPERAND_ABI_POINTER, *fun); + dst[argc] = jit_operand_gpr (JIT_OPERAND_ABI_POINTER, fun_tmp); + *fun = fun_tmp; + } + struct abi_arg_iterator iter; - + // Compute shuffle destinations and space for spilled arguments. reset_abi_arg_iterator(&iter, argc, args); for (size_t i = 0; i < argc; i++) @@ -1265,7 +1288,7 @@ prepare_call_args(jit_state_t *_jit, size_t argc, jit_operand_t args[]) } } - jit_move_operands(_jit, dst, args, argc); + jit_move_operands(_jit, dst, src, count); return stack_size; } @@ -1273,7 +1296,7 @@ prepare_call_args(jit_state_t *_jit, size_t argc, jit_operand_t args[]) void jit_calli(jit_state_t *_jit, jit_pointer_t f, size_t argc, jit_operand_t args[]) { - size_t stack_bytes = prepare_call_args(_jit, argc, args); + size_t stack_bytes = prepare_call_args(_jit, argc, args, NULL); calli(_jit, (jit_word_t)f); @@ -1283,7 +1306,7 @@ jit_calli(jit_state_t *_jit, jit_pointer_t f, size_t argc, jit_operand_t args[]) void jit_callr(jit_state_t *_jit, jit_gpr_t f, size_t argc, jit_operand_t args[]) { - size_t stack_bytes = prepare_call_args(_jit, argc, args); + size_t stack_bytes = prepare_call_args(_jit, argc, args, &f); callr(_jit, jit_gpr_regno(f)); diff --git a/lightening/x86.c b/lightening/x86.c index f8ac4b0b8..873cb27a4 100644 --- a/lightening/x86.c +++ b/lightening/x86.c @@ -405,3 +405,9 @@ bless_function_pointer(void *ptr) { return ptr; } + +static jit_gpr_t +get_callr_temp (jit_state_t * _jit) +{ + return _RAX; +} diff --git a/tests/call_10_2.c b/tests/call_10_2.c new file mode 100644 index 000000000..189757876 --- /dev/null +++ b/tests/call_10_2.c @@ -0,0 +1,165 @@ +#include "test.h" +#include "regarrays.inc" + +#define DEFINE_TEST_INT(ABI_TYPE, TYPE, LIT, NEGATE) \ +static TYPE \ +check_##TYPE (TYPE a, TYPE b, TYPE c, TYPE d, TYPE e, \ + TYPE f, TYPE g, TYPE h, TYPE i, TYPE j) \ +{ \ + ASSERT(a == LIT(0)); \ + ASSERT(b == NEGATE(1)); \ + ASSERT(c == LIT(2)); \ + ASSERT(d == 
NEGATE(3)); \ + ASSERT(e == LIT(4)); \ + ASSERT(f == NEGATE(5)); \ + ASSERT(g == LIT(6)); \ + ASSERT(h == NEGATE(7)); \ + ASSERT(i == LIT(8)); \ + ASSERT(j == NEGATE(9)); \ + return LIT(42); \ +} \ + \ +static void \ +run_test_##TYPE (jit_state_t *j, uint8_t *arena_base, size_t arena_size, \ + jit_gpr_t base) \ +{ \ + jit_begin(j, arena_base, arena_size); \ + size_t align = jit_enter_jit_abi(j, v_count, 0, 0); \ + jit_load_args_1(j, jit_operand_gpr (JIT_OPERAND_ABI_POINTER, base)); \ + \ + jit_operand_t args[10] = { \ + jit_operand_mem(ABI_TYPE, base, 0 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 1 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 2 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 3 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 4 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 5 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 6 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 7 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 8 * sizeof(TYPE)), \ + jit_operand_mem(ABI_TYPE, base, 9 * sizeof(TYPE)), \ + }; \ + jit_calli(j, check_##TYPE, 10, args); \ + jit_leave_jit_abi(j, v_count, 0, align); \ + jit_ret(j); \ + \ + size_t size = 0; \ + void* ret = jit_end(j, &size); \ + \ + TYPE (*f)(TYPE*) = ret; \ + \ + TYPE iargs[10] = { LIT(0), NEGATE(1), LIT(2), NEGATE(3), LIT(4), \ + NEGATE(5), LIT(6), NEGATE(7), LIT(8), NEGATE(9) }; \ + ASSERT(f(iargs) == LIT(42)); \ +} + +#define LIT(X) (X) +#define NEGATE(X) (-X) +DEFINE_TEST_INT(JIT_OPERAND_ABI_INT32, int32_t, LIT, NEGATE); +#if (UINTPTR_MAX == UINT64_MAX) +DEFINE_TEST_INT(JIT_OPERAND_ABI_INT64, int64_t, LIT, NEGATE); +#endif +#undef NEGATE + +#define NEGATE(X) (~X) +DEFINE_TEST_INT(JIT_OPERAND_ABI_UINT32, uint32_t, LIT, NEGATE); +#if (UINTPTR_MAX == UINT64_MAX) +DEFINE_TEST_INT(JIT_OPERAND_ABI_UINT64, uint64_t, LIT, NEGATE); +#endif +#undef NEGATE +#undef LIT + +typedef uint8_t* ptr_t; +#define LIT(X) ((ptr_t)(uintptr_t)(X)) +#define NEGATE(X) 
((ptr_t)(~(uintptr_t)(X))) +DEFINE_TEST_INT(JIT_OPERAND_ABI_POINTER, ptr_t, LIT, NEGATE); + +static double +check_double (double a, double b, double c, double d, double e, + double f, double g, double h, double i, double j) +{ + ASSERT(a == 0.0); + ASSERT(b == -1.0); + ASSERT(c == -0xfffffffffffffp+100l); + ASSERT(d == +0xfffffffffffffp-100l); + ASSERT(e == -0xfffffffffffffp+101l); + ASSERT(f == +0xfffffffffffffp-102l); + ASSERT(g == -0xfffffffffffffp+102l); + ASSERT(h == +0xfffffffffffffp-103l); + ASSERT(i == -0xfffffffffffffp+103l); + ASSERT(j == +0xfffffffffffffp-104l); + return 42; +} + +static void +run_test_double (jit_state_t *j, uint8_t *arena_base, size_t arena_size, + jit_gpr_t base) +{ + double dargs[10] = { + 0.0, + -1.0, + -0xfffffffffffffp+100l, + +0xfffffffffffffp-100l, + -0xfffffffffffffp+101l, + +0xfffffffffffffp-102l, + -0xfffffffffffffp+102l, + +0xfffffffffffffp-103l, + -0xfffffffffffffp+103l, + +0xfffffffffffffp-104l, + }; + jit_begin(j, arena_base, arena_size); + size_t align = jit_enter_jit_abi(j, v_count, 0, 0); + jit_load_args_1(j, jit_operand_gpr (JIT_OPERAND_ABI_POINTER, base)); + enum jit_operand_abi abi = JIT_OPERAND_ABI_DOUBLE; + jit_movi_d(j, JIT_F0, dargs[0]); + jit_movi_d(j, JIT_F1, dargs[1]); + jit_movi_d(j, JIT_F2, dargs[2]); + jit_movi_d(j, JIT_F3, dargs[3]); + jit_movi_d(j, JIT_F4, dargs[4]); + jit_movi_d(j, JIT_F5, dargs[5]); + jit_movi_d(j, JIT_F6, dargs[6]); + jit_operand_t args[10] = { + jit_operand_fpr(abi, JIT_F0), + jit_operand_fpr(abi, JIT_F1), + jit_operand_fpr(abi, JIT_F2), + jit_operand_fpr(abi, JIT_F3), + jit_operand_fpr(abi, JIT_F4), + jit_operand_fpr(abi, JIT_F5), + jit_operand_fpr(abi, JIT_F6), + jit_operand_mem(abi, base, 7 * sizeof(double)), + jit_operand_mem(abi, base, 8 * sizeof(double)), + jit_operand_mem(abi, base, 9 * sizeof(double)), + }; + jit_calli(j, check_double, 10, args); + jit_leave_jit_abi(j, v_count, 0, align); + jit_ret(j); + + size_t size = 0; + void* ret = jit_end(j, &size); + + double 
(*f)(double*) = ret; + + ASSERT(f(dargs) == 42); +} + +static void +run_test (jit_state_t * j, uint8_t * arena_base, size_t arena_size) +{ + for (unsigned i = 0; i < gpr_count; i++) + { + run_test_int32_t (j, arena_base, arena_size, gpr_ref (i)); + run_test_uint32_t (j, arena_base, arena_size, gpr_ref (i)); +#if (UINTPTR_MAX == UINT64_MAX) + run_test_int64_t (j, arena_base, arena_size, gpr_ref (i)); + run_test_uint64_t (j, arena_base, arena_size, gpr_ref (i)); +#endif + run_test_ptr_t (j, arena_base, arena_size, gpr_ref (i)); + run_test_double (j, arena_base, arena_size, gpr_ref (i)); + } +} + +int +main (int argc, char *argv[]) +{ + return main_helper(argc, argv, run_test); +} diff --git a/tests/callr_10.c b/tests/callr_10.c new file mode 100644 index 000000000..bca488c75 --- /dev/null +++ b/tests/callr_10.c @@ -0,0 +1,66 @@ +#include "test.h" +#include "regarrays.inc" + +static int32_t f(int32_t a, int32_t b, int32_t c, int32_t d, int32_t e, + int32_t f, int32_t g, int32_t h, int32_t i, int32_t j) { + ASSERT(a == 0); + ASSERT(b == 1); + ASSERT(c == 2); + ASSERT(d == 3); + ASSERT(e == 4); + ASSERT(f == 5); + ASSERT(g == 6); + ASSERT(h == 7); + ASSERT(i == 8); + ASSERT(j == 9); + return 42; +} + +static void +run_test_2 (jit_state_t *j, uint8_t *arena_base, size_t arena_size, + jit_gpr_t base, jit_gpr_t fun) +{ + jit_begin(j, arena_base, arena_size); + size_t align = jit_enter_jit_abi(j, v_count, 0, 0); + jit_load_args_1(j, jit_operand_gpr (JIT_OPERAND_ABI_POINTER, base)); + + jit_operand_t args[10] = { + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 0 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 1 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 2 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 3 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 4 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 5 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 6 * 
sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 7 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 8 * sizeof(int32_t)), + jit_operand_mem(JIT_OPERAND_ABI_INT32, base, 9 * sizeof(int32_t)) + }; + jit_movi(j, fun, (uintptr_t)f); + jit_callr(j, fun, 10, args); + jit_leave_jit_abi(j, v_count, 0, align); + jit_ret(j); + + size_t size = 0; + void* ret = jit_end(j, &size); + + int32_t (*f)(int32_t*) = ret; + + int32_t iargs[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + ASSERT(f(iargs) == 42); +} + +static void +run_test (jit_state_t *jit, uint8_t *arena_base, size_t arena_size) +{ + for (unsigned i = 0; i < gpr_count; i++) + for (unsigned j = 0; j < gpr_count; j++) + if (i != j) + run_test_2 (jit, arena_base, arena_size, gpr_ref(i), gpr_ref(j)); +} + +int +main (int argc, char *argv[]) +{ + return main_helper(argc, argv, run_test); +} diff --git a/tests/regarrays.inc b/tests/regarrays.inc new file mode 100644 index 000000000..de56c905c --- /dev/null +++ b/tests/regarrays.inc @@ -0,0 +1,206 @@ +/* Arrays describing the available user registers. 
-*- mode: c -*- */ + +// #ifdef orgy factored out to common include file + +static const jit_gpr_t rregs[] = { + JIT_R0, + JIT_R1, + JIT_R2, +#ifdef JIT_R3 + JIT_R3, +#endif +#ifdef JIT_R4 + JIT_R4, +#endif +#ifdef JIT_R5 + JIT_R5, +#endif +#ifdef JIT_R6 + JIT_R6, +#endif +#ifdef JIT_R7 + JIT_R7, +#endif +#ifdef JIT_R8 + JIT_R8, +#endif +#ifdef JIT_R9 + JIT_R9, +#endif +#ifdef JIT_R10 + JIT_R10, +#endif +#ifdef JIT_R11 + JIT_R11, +#endif +#ifdef JIT_R12 + JIT_R12, +#endif +#ifdef JIT_R13 + JIT_R13, +#endif +#ifdef JIT_R14 + JIT_R14, +#endif +#ifdef JIT_R15 + JIT_R15, +#endif +#ifdef JIT_R16 + JIT_R16, +#endif +}; + +static const jit_gpr_t vregs[] = { + JIT_V0, JIT_V1, JIT_V2, +#ifdef JIT_V3 + JIT_V3, +#endif +#ifdef JIT_V4 + JIT_V4, +#endif +#ifdef JIT_V5 + JIT_V5, +#endif +#ifdef JIT_V6 + JIT_V6, +#endif +#ifdef JIT_V7 + JIT_V7, +#endif +#ifdef JIT_V8 + JIT_V8, +#endif +#ifdef JIT_V9 + JIT_V9, +#endif +#ifdef JIT_V10 + JIT_V10, +#endif +#ifdef JIT_V11 + JIT_V11, +#endif +#ifdef JIT_V12 + JIT_V12, +#endif +#ifdef JIT_V13 + JIT_V13, +#endif +#ifdef JIT_V14 + JIT_V14, +#endif +#ifdef JIT_V15 + JIT_V15, +#endif +#ifdef JIT_V16 + JIT_V16, +#endif +}; + +static const jit_fpr_t fregs[] = { + JIT_F0, JIT_F1, JIT_F2, + JIT_F3, JIT_F4, JIT_F5, JIT_F6, +#ifdef JIT_F7 + JIT_F7, +#endif +#ifdef JIT_F8 + JIT_F8, +#endif +#ifdef JIT_F9 + JIT_F9, +#endif +#ifdef JIT_F10 + JIT_F10, +#endif +#ifdef JIT_F11 + JIT_F11, +#endif +#ifdef JIT_F12 + JIT_F12, +#endif +#ifdef JIT_F13 + JIT_F13, +#endif +#ifdef JIT_F14 + JIT_F14, +#endif +#ifdef JIT_F15 + JIT_F15, +#endif +#ifdef JIT_F16 + JIT_F16, +#endif +}; + +static const jit_fpr_t vfregs[] = { +#ifdef JIT_VF0 + JIT_VF0, +#endif +#ifdef JIT_VF1 + JIT_VF1, +#endif +#ifdef JIT_VF2 + JIT_VF2, +#endif +#ifdef JIT_VF2 + JIT_VF2, +#endif +#ifdef JIT_VF3 + JIT_VF3, +#endif +#ifdef JIT_VF4 + JIT_VF4, +#endif +#ifdef JIT_VF5 + JIT_VF5, +#endif +#ifdef JIT_VF6 + JIT_VF6, +#endif +#ifdef JIT_VF7 + JIT_VF7, +#endif +#ifdef JIT_VF8 + JIT_VF8, +#endif +#ifdef 
JIT_VF9 + JIT_VF9, +#endif +#ifdef JIT_VF10 + JIT_VF10, +#endif +#ifdef JIT_VF11 + JIT_VF11, +#endif +#ifdef JIT_VF12 + JIT_VF12, +#endif +#ifdef JIT_VF13 + JIT_VF13, +#endif +#ifdef JIT_VF14 + JIT_VF14, +#endif +#ifdef JIT_VF15 + JIT_VF15, +#endif +#ifdef JIT_VF16 + JIT_VF16, +#endif +}; + +#define ARRAY_SIZE(X) (sizeof (X)/sizeof ((X)[0])) +static const size_t r_count = ARRAY_SIZE (rregs); +static const size_t v_count = ARRAY_SIZE (vregs); +static const size_t f_count = ARRAY_SIZE (fregs); +static const size_t vf_count = ARRAY_SIZE (vfregs); +static const size_t gpr_count = r_count + v_count; + +static jit_gpr_t +gpr_ref (uintptr_t i) +{ + if (i < r_count) + return rregs[i]; + if (i < r_count + v_count) + return vregs[i - r_count]; + abort (); +} From 436a8b278b2a6e3771efb2c322ad1fdedcd599af Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Mon, 22 Apr 2024 15:14:13 +0200 Subject: [PATCH 02/23] aarch64: Fix duplicate declaration --- lightening/aarch64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lightening/aarch64.c b/lightening/aarch64.c index 1018193c4..4ff4ea96e 100644 --- a/lightening/aarch64.c +++ b/lightening/aarch64.c @@ -165,7 +165,6 @@ struct abi_arg_iterator }; static size_t page_size; -static int has_lse_atomics; # define HWCAP_ATOMICS (1 << 8) From 11918685e122597a3443df3e273e0c8327d1251a Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 29 Jan 2025 12:14:59 +0100 Subject: [PATCH 03/23] Add movr_f_i, movr_i_f, movr_d_l, movr_l_d These move values verbatim between FPRs and GPRs. 
--- lightening.h | 6 +++++- lightening/aarch64-fpu.c | 26 +++++++++++++++++++++++++- lightening/arm-vfp.c | 14 +++++++++++++- lightening/x86-sse.c | 36 +++++++++++++++++++++++++++++++++++- tests/movr_dl.c | 26 ++++++++++++++++++++++++++ tests/movr_fi.c | 24 ++++++++++++++++++++++++ 6 files changed, 128 insertions(+), 4 deletions(-) create mode 100644 tests/movr_dl.c create mode 100644 tests/movr_fi.c diff --git a/lightening.h b/lightening.h index efa5dfdf1..1b296bd66 100644 --- a/lightening.h +++ b/lightening.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2020 Free Software Foundation, Inc. + * Copyright (C) 2012-2020, 2025 Free Software Foundation, Inc. * * This file is part of GNU lightning. * @@ -622,6 +622,10 @@ jit_load_args_3(jit_state_t *_jit, jit_operand_t a, jit_operand_t b, M(_FF__, extr_f_d) \ M(_FF__, movr_f) \ M(_FF__, movr_d) \ + M(_GF__, movr_i_f) \ + M(_FG__, movr_f_i) \ + WHEN_64(M(_GF__, movr_l_d)) \ + WHEN_64(M(_FG__, movr_d_l)) \ M(_Ff__, movi_f) \ M(_Fd__, movi_d) \ M(_GF__, truncr_d_i) \ diff --git a/lightening/aarch64-fpu.c b/lightening/aarch64-fpu.c index 629734264..80dee334d 100644 --- a/lightening/aarch64-fpu.c +++ b/lightening/aarch64-fpu.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2013-2019 Free Software Foundation, Inc. + * Copyright (C) 2013-2019, 2025 Free Software Foundation, Inc. * * This file is part of GNU lightning. 
* @@ -638,6 +638,18 @@ movi_f(jit_state_t *_jit, int32_t r0, float i0) } } +static void +movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + FMOVSW(_jit, r0, r1); +} + +static void +movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + FMOVWS(_jit, r0, r1); +} + static jit_reloc_t buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1) { @@ -759,6 +771,18 @@ movi_d(jit_state_t *_jit, int32_t r0, double i0) } } +static void +movr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + FMOVDX(_jit, r0, r1); +} + +static void +movr_l_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + FMOVXD(_jit, r0, r1); +} + static jit_reloc_t buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1) { diff --git a/lightening/arm-vfp.c b/lightening/arm-vfp.c index 208edc316..63134dcf5 100644 --- a/lightening/arm-vfp.c +++ b/lightening/arm-vfp.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2017, 2019 Free Software Foundation, Inc. + * Copyright (C) 2012-2017, 2019, 2025 Free Software Foundation, Inc. * * This file is part of GNU lightning. * @@ -913,6 +913,18 @@ movi_d(jit_state_t *_jit, int32_t r0, jit_float64_t i0) } } +static void +movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + VMOV_S_A(_jit, r0, r1); +} + +static void +movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + VMOV_A_S32(_jit, r0, r1); +} + static void extr_d_f(jit_state_t *_jit, int32_t r0, int32_t r1) { diff --git a/lightening/x86-sse.c b/lightening/x86-sse.c index ab66dc7c5..0331ff056 100644 --- a/lightening/x86-sse.c +++ b/lightening/x86-sse.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2012-2017, 2019 Free Software Foundation, Inc. + * Copyright (C) 2012-2017, 2019, 2025 Free Software Foundation, Inc. * * This file is part of GNU lightning. 
* @@ -128,13 +128,24 @@ movdlxr(jit_state_t *_jit, int32_t r0, int32_t r1) { ssexr(_jit, 0x66, X86_SSE_X2G, r0, r1); } +static void +movdlrx(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + ssexr(_jit, 0x66, X86_SSE_G2X, r0, r1); +} static void movdqxr(jit_state_t *_jit, int32_t r0, int32_t r1) maybe_unused; +static void movdqrx(jit_state_t *_jit, int32_t r0, int32_t r1) maybe_unused; static void movdqxr(jit_state_t *_jit, int32_t r0, int32_t r1) { sselxr(_jit, 0x66, X86_SSE_X2G, r0, r1); } +static void +movdqrx(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + sselxr(_jit, 0x66, X86_SSE_G2X, r0, r1); +} static void movssmr(jit_state_t *_jit, int32_t md, int32_t rb, int32_t ri, int32_t ms, int32_t rd) @@ -171,6 +182,29 @@ movr_d(jit_state_t *_jit, int32_t r0, int32_t r1) ssexr(_jit, 0xf2, X86_SSE_MOV, r0, r1); } +static void +movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + movdlrx(_jit, r0, r1); +} +static void +movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + movdlxr(_jit, r0, r1); +} +#if __X64 +static void +movr_l_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + movdqrx(_jit, r0, r1); +} +static void +movr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + movdqxr(_jit, r0, r1); +} +#endif + static void addssr(jit_state_t *_jit, int32_t r0, int32_t r1) { diff --git a/tests/movr_dl.c b/tests/movr_dl.c new file mode 100644 index 000000000..029d41bcc --- /dev/null +++ b/tests/movr_dl.c @@ -0,0 +1,26 @@ +#include "test.h" + +static void +run_test(jit_state_t *j, uint8_t *arena_base, size_t arena_size) +{ +#if __WORDSIZE > 32 + jit_begin(j, arena_base, arena_size); + size_t align = jit_enter_jit_abi(j, 0, 0, 0); + + jit_movi_d(j, JIT_F0, 3.14159); + jit_movr_l_d(j, JIT_R0, JIT_F0); + jit_movr_d_l(j, JIT_F1, JIT_R0); + jit_leave_jit_abi(j, 0, 0, align); + jit_retr_d(j, JIT_F1); + + double (*f)(void) = jit_end(j, NULL); + + ASSERT(f() == 3.14159); +#endif +} + +int +main (int argc, char *argv[]) +{ + return main_helper(argc, argv, run_test); +} diff 
--git a/tests/movr_fi.c b/tests/movr_fi.c new file mode 100644 index 000000000..f8d3bdf4c --- /dev/null +++ b/tests/movr_fi.c @@ -0,0 +1,24 @@ +#include "test.h" + +static void +run_test(jit_state_t *j, uint8_t *arena_base, size_t arena_size) +{ + jit_begin(j, arena_base, arena_size); + size_t align = jit_enter_jit_abi(j, 0, 0, 0); + + jit_movi_f(j, JIT_F0, 3.14159); + jit_movr_i_f(j, JIT_R0, JIT_F0); + jit_movr_f_i(j, JIT_F1, JIT_R0); + jit_leave_jit_abi(j, 0, 0, align); + jit_retr_f(j, JIT_F1); + + float (*f)(void) = jit_end(j, NULL); + + ASSERT(f() == 3.14159f); +} + +int +main (int argc, char *argv[]) +{ + return main_helper(argc, argv, run_test); +} From 23c4e36dcaa7f0758023ad58208dcf645bedef2f Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 13 May 2021 17:52:28 +0200 Subject: [PATCH 04/23] Makefile: RISCV support and optional vars Optional variables are needed because the structure of the makefile is prepared to run on Guix but Guix doesn't support RISCV yet, so it's better to set them as optional and let the user decide how they want to compile it. 
--- tests/Makefile | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tests/Makefile b/tests/Makefile index 769b43423..5c44bceae 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -1,5 +1,5 @@ -TESTS=$(sort $(basename $(wildcard *.c))) -TARGETS ?= native ia32 aarch64 armv7 +TESTS ?= $(sort $(basename $(wildcard *.c))) +TARGETS ?= native ia32 aarch64 armv7 riscv # Suitable values of cross-compiler variables for Debian: # @@ -17,10 +17,11 @@ TARGETS ?= native ia32 aarch64 armv7 # gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64 # CC = gcc -CC_IA32=guix environment --pure -s i686-linux --ad-hoc gcc-toolchain -- gcc -CC_AARCH64=guix environment --pure -s aarch64-linux --ad-hoc gcc-toolchain -- gcc -CC_ARMv7=guix environment --pure -s armhf-linux --ad-hoc gcc-toolchain -- gcc -CFLAGS = -Wall -O0 -g +CC_IA32 ?= guix environment --pure -s i686-linux --ad-hoc gcc-toolchain -- gcc +CC_AARCH64 ?= guix environment --pure -s aarch64-linux --ad-hoc gcc-toolchain -- gcc +CC_ARMv7 ?= guix environment --pure -s armhf-linux --ad-hoc gcc-toolchain -- gcc +CC_RISCV ?= guix environment --pure -s riscv64-linux --ad-hoc gcc-toolchain -- gcc +CFLAGS ?= -Wall -O0 -g all: $(foreach TARGET,$(TARGETS),$(addprefix test-$(TARGET)-,$(TESTS))) @@ -54,6 +55,10 @@ test-armv7-%: CC = $(CC_ARMv7) test-armv7-%: %.c lightening-armv7.o test.h $(CC) $(CFLAGS) $(CPPFLAGS) -I.. -o $@ lightening-armv7.o $< +test-riscv-%: CC = $(CC_RISCV) +test-riscv-%: %.c lightening-riscv.o test.h + $(CC) $(CFLAGS) $(CPPFLAGS) -I.. 
-o $@ lightening-riscv.o $< + .PRECIOUS: $(foreach TARGET,$(TARGETS),$(addprefix test-$(TARGET)-,$(TESTS))) .PRECIOUS: $(foreach TARGET,$(TARGETS),lightening-$(TARGET).o) From cbd72e71a7197912d6473743fba6799157c2b196 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Sun, 9 May 2021 16:39:03 +0200 Subject: [PATCH 05/23] RISC-V Support --- lightening.am | 6 +- lightening.h | 2 + lightening/endian.h | 2 + lightening/lightening.c | 33 + lightening/riscv-cpu.c | 2470 +++++++++++++++++++++++++++++++++++++++ lightening/riscv-fpu.c | 858 ++++++++++++++ lightening/riscv.c | 327 ++++++ lightening/riscv.h | 194 +++ 8 files changed, 3891 insertions(+), 1 deletion(-) create mode 100644 lightening/riscv-cpu.c create mode 100644 lightening/riscv-fpu.c create mode 100644 lightening/riscv.c create mode 100644 lightening/riscv.h diff --git a/lightening.am b/lightening.am index 2c9089ead..ba55f2c7f 100644 --- a/lightening.am +++ b/lightening.am @@ -40,6 +40,7 @@ lightening_extra_files = \ $(lightening)/lightening/mips.h \ $(lightening)/lightening/ppc.h \ $(lightening)/lightening/x86.h \ + $(lightening)/lightening/riscv.h \ \ $(lightening)/lightening/aarch64.c \ $(lightening)/lightening/aarch64-cpu.c \ @@ -55,4 +56,7 @@ lightening_extra_files = \ $(lightening)/lightening/ppc-fpu.c \ $(lightening)/lightening/x86.c \ $(lightening)/lightening/x86-cpu.c \ - $(lightening)/lightening/x86-sse.c + $(lightening)/lightening/x86-sse.c \ + $(lightening)/lightening/riscv.c \ + $(lightening)/lightening/riscv-cpu.c \ + $(lightening)/lightening/riscv-fpu.c diff --git a/lightening.h b/lightening.h index 1b296bd66..b364e18cc 100644 --- a/lightening.h +++ b/lightening.h @@ -77,6 +77,8 @@ jit_same_fprs (jit_fpr_t a, jit_fpr_t b) # include "lightening/aarch64.h" #elif defined(__s390__) || defined(__s390x__) # include "lightening/s390.h" +#elif defined(__riscv__) || defined(__riscv) +# include "lightening/riscv.h" #endif enum jit_reloc_kind diff --git a/lightening/endian.h b/lightening/endian.h index 
3b34a1518..e3689a117 100644 --- a/lightening/endian.h +++ b/lightening/endian.h @@ -38,6 +38,8 @@ # else # define __WORDSIZE 64 # endif +# elif defined(__riscv_xlen) +# define __WORDSIZE __riscv_xlen /* riscv */ # else /* From FreeBSD 9.1 stdint.h */ # if defined(UINTPTR_MAX) && defined(UINT64_MAX) && \ (UINTPTR_MAX == UINT64_MAX) diff --git a/lightening/lightening.c b/lightening/lightening.c index afc6fd493..c66b3a132 100644 --- a/lightening/lightening.c +++ b/lightening/lightening.c @@ -271,6 +271,22 @@ get_temp_gpr(jit_state_t *_jit) #ifdef JIT_TMP1 case 1: return JIT_TMP1; +#endif +#ifdef JIT_TMP2 + case 2: + return JIT_TMP2; +#endif +#ifdef JIT_TMP3 + case 3: + return JIT_TMP3; +#endif +#ifdef JIT_TMP4 + case 4: + return JIT_TMP4; +#endif +#ifdef JIT_TMP5 + case 5: + return JIT_TMP5; #endif default: abort(); @@ -561,6 +577,8 @@ jit_emit_addr(jit_state_t *j) # include "aarch64.c" #elif defined(__s390__) || defined(__s390x__) # include "s390.c" +#elif defined(__riscv__) || defined(__riscv) +# include "riscv.c" #endif #define JIT_IMPL_0(stem, ret) \ @@ -1167,6 +1185,9 @@ static const jit_gpr_t user_callee_save_gprs[] = { #endif #ifdef JIT_V9 , JIT_V9 +#endif +#ifdef JIT_V10 + , JIT_V10 #endif }; @@ -1195,6 +1216,18 @@ static const jit_fpr_t user_callee_save_fprs[] = { #ifdef JIT_VF7 , JIT_VF7 #endif +#ifdef JIT_VF8 + , JIT_VF8 +#endif +#ifdef JIT_VF9 + , JIT_VF9 +#endif +#ifdef JIT_VF10 + , JIT_VF10 +#endif +#ifdef JIT_VF11 + , JIT_VF11 +#endif }; #define ARRAY_SIZE(X) (sizeof (X)/sizeof ((X)[0])) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c new file mode 100644 index 000000000..d9d36f30b --- /dev/null +++ b/lightening/riscv-cpu.c @@ -0,0 +1,2470 @@ +/* + * Copyright (C) 2012-2024 Free Software Foundation, Inc. + * + * This file is part of GNU lightning. 
+ * + * GNU lightning is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU lightning is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. + * + * Authors: + * Paulo Cesar Pereira de Andrade + * Ekaitz Zarraga + */ +#define stack_framesize (200 + 64) +#define simm6_p(im) ((im) <= 31 && (im) >= -32) +#define simm12_p(im) ((im) <= 2047 && (im) >= -2048) +#define simm20_p(im) ((im) <= 524287 && (im) >= -524288) +#define simm32_p(im) ((im) <= 2147483647LL && (im) >= -2147483648LL) + +typedef union { + struct { + uint32_t opcode : 7; + uint32_t rd : 5; + uint32_t funct3 : 3; + uint32_t rs1 : 5; + uint32_t rs2 : 5; + uint32_t funct7 : 7; + } R; + struct { + uint32_t opcode : 7; + uint32_t rd : 5; + uint32_t funct3 : 3; + uint32_t rs1 : 5; + uint32_t rs2 : 5; + uint32_t rl : 1; + uint32_t aq : 1; + uint32_t funct5 : 5; + } R4; + struct { + uint32_t opcode : 7; + uint32_t rd : 5; + uint32_t funct3 : 3; + uint32_t rs1 : 5; + uint32_t imm11_0 : 12; + } I; +#if __WORDSIZE == 64 + struct { + uint32_t opcode : 7; + uint32_t rd : 5; + uint32_t funct3 : 3; + uint32_t rs1 : 5; + uint32_t shamt : 6; + uint32_t imm6_0 : 6; + } IS; +#endif + struct { + uint32_t opcode : 7; + uint32_t imm4_0 : 5; + uint32_t funct3 : 3; + uint32_t rs1 : 5; + uint32_t rs2 : 5; + uint32_t imm11_5 : 7; + } S; + struct { + uint32_t opcode : 7; + uint32_t imm11 : 1; + uint32_t imm4_1 : 4; + uint32_t funct3 : 3; + uint32_t rs1 : 5; + uint32_t rs2 : 5; + uint32_t imm10_5 : 6; + uint32_t imm12 : 1; + } B; + struct { + uint32_t opcode : 7; + uint32_t rd : 5; + uint32_t imm31_12 : 20; + } U; + struct { + uint32_t opcode : 7; + uint32_t rd : 5; 
+ uint32_t imm19_12 : 8; + uint32_t imm11 : 1; + uint32_t imm10_1 : 10; + uint32_t imm20 : 1; + } J; + uint32_t w; +} instr_t; + + +// TODO: Compressed instruction support + +static uint32_t +Rtype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t rs2, + int32_t fct2) +{ + instr_t i; + assert(!(op & ~0x7f)); + assert(!(rd & ~0x1f)); + assert(!(fct & ~0x07)); + assert(!(rs1 & ~0x1f)); + assert(!(rs2 & ~0x1f)); + assert(!(fct2 & ~0x7f)); + i.R.opcode = op; + i.R.rd = rd; + i.R.funct3 = fct; + i.R.rs1 = rs1; + i.R.rs2 = rs2; + i.R.funct7 = fct2; + return i.w; +} + +static uint32_t +R4type(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t rs2, + int32_t aq, int32_t rl, int32_t fct5) +{ + instr_t i; + assert(!(op & ~0x7f)); + assert(!(rd & ~0x1f)); + assert(!(fct & ~0x07)); + assert(!(rs1 & ~0x1f)); + assert(!(rs2 & ~0x1f)); + assert(!(fct5 & ~0x1f)); + assert(!(aq & ~0x01)); + assert(!(rl & ~0x01)); + i.R4.opcode = op; + i.R4.rd = rd; + i.R4.funct3 = fct; + i.R4.rs1 = rs1; + i.R4.rs2 = rs2; + i.R4.aq = aq; + i.R4.rl = rl; + i.R4.funct5 = fct5; + return i.w; +} + +static uint32_t +Itype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t imm) +{ + instr_t i; + assert(!(op & ~0x7f)); + assert(!(rd & ~0x1f)); + assert(!(fct & ~0x07)); + assert(!(rs1 & ~0x1f)); + assert(simm12_p(imm)); + i.I.opcode = op; + i.I.rd = rd; + i.I.funct3 = fct; + i.I.rs1 = rs1; + i.I.imm11_0 = imm; + return i.w; +} + +# if __WORDSIZE == 64 + static uint32_t +IStype(int32_t op, int32_t rd, int32_t fct, int32_t rs1, int32_t sh, + int32_t imm) +{ + instr_t i; + assert(!(op & ~0x7f)); + assert(!(rd & ~0x1f)); + assert(!(fct & ~0x07)); + assert(!(rs1 & ~0x1f)); + assert(!(sh & ~0x3f)); + assert(simm6_p(imm)); + i.IS.opcode = op; + i.IS.rd = rd; + i.IS.funct3 = fct; + i.IS.rs1 = rs1; + i.IS.shamt = sh; + i.IS.imm6_0 = imm; + return i.w; +} +# endif + +static uint32_t +Stype(int32_t op, int32_t fct, int32_t rs1, int32_t rs2, int32_t imm) +{ + instr_t i; + assert(!(op & ~0x7f)); + 
assert(!(fct & ~0x07)); + assert(!(rs1 & ~0x1f)); + assert(!(rs2 & ~0x1f)); + assert(simm12_p(imm)); + i.S.opcode = op; + i.S.imm4_0 = imm & 0x1f; + i.S.funct3 = fct; + i.S.rs1 = rs1; + i.S.rs2 = rs2; + i.S.imm11_5 = (imm >> 5) & 0x7f; + return i.w; +} + +static uint32_t +Btype(int32_t op, int32_t fct, int32_t rs1, int32_t rs2, int32_t imm) +{ + instr_t i; + assert(!(op & ~0x7f)); + assert(!(fct & ~0x07)); + assert(!(rs1 & ~0x1f)); + assert(!(rs2 & ~0x1f)); + assert(!(imm & 1) && simm12_p(imm)); + i.B.opcode = op; + i.B.imm11 = (imm >> 11) & 0x1; + i.B.imm4_1 = (imm >> 1) & 0xf; + i.B.funct3 = fct; + i.B.rs1 = rs1; + i.B.rs2 = rs2; + i.B.imm10_5 = (imm >> 5) & 0x3f; + i.B.imm12 = (imm >> 12) & 0x1; + return i.w; +} + +static uint32_t +Utype(int32_t op, int32_t rd, int32_t imm) +{ + instr_t i; + assert(!(op & ~0x7f)); + assert(!(rd & ~0x1f)); + assert(simm20_p(imm)); + i.U.opcode = op; + i.U.rd = rd; + i.U.imm31_12= imm; + return i.w; +} + +static uint32_t +Jtype(int32_t op, int32_t rd, int32_t imm) +{ + instr_t i; + assert(!(op & ~0x7f)); + assert(!(rd & ~0x1f)); + assert(!(imm & 1) && imm <= 1048575 && imm >= -1048576); + i.J.opcode = op; + i.J.rd = rd; + i.J.imm19_12= (imm >> 12) & 0xff; + i.J.imm11 = (imm >> 11) & 0x1; + i.J.imm10_1 = (imm >> 1) & 0x3ff; + i.J.imm20 = (imm >> 20) & 0x1; + return i.w; +} + +/* + * RV32I Base Instruction Set + */ +#define _LUI(rd, imm) Utype(55, rd, imm) +#define _AUIPC(rd, imm) Utype(23, rd, imm) +#define _JAL(rd, imm) Jtype(111, rd, imm) +#define _JALR(rd, rs1, imm) Itype(103, rd, 0, rs1, imm) +#define _BEQ(rs1, rs2, imm) Btype(99, 0, rs1, rs2, imm) +#define _BNE(rs1, rs2, imm) Btype(99, 1, rs1, rs2, imm) +#define _BLT(rs1, rs2, imm) Btype(99, 4, rs1, rs2, imm) +#define _BGE(rs1, rs2, imm) Btype(99, 5, rs1, rs2, imm) +#define _BLTU(rs1, rs2, imm) Btype(99, 6, rs1, rs2, imm) +#define _BGEU(rs1, rs2, imm) Btype(99, 7, rs1, rs2, imm) +#define _LB(rd, rs1, imm) Itype(3, rd, 0, rs1, imm) +#define _LH(rd, rs1, imm) Itype(3, rd, 1, 
rs1, imm) +#define _LW(rd, rs1, imm) Itype(3, rd, 2, rs1, imm) +#define _LBU(rd, rs1, imm) Itype(3, rd, 4, rs1, imm) +#define _LHU(rd, rs1, imm) Itype(3, rd, 5, rs1, imm) +#define _SB(rs1, rs2, imm) Stype(35, 0, rs1, rs2, imm) +#define _SH(rs1, rs2, imm) Stype(35, 1, rs1, rs2, imm) +#define _SW(rs1, rs2, imm) Stype(35, 2, rs1, rs2, imm) +#define _ADDI(rd, rs1, imm) Itype(19, rd, 0, rs1, imm) +#define _SLTI(rd, rs1, imm) Itype(19, rd, 2, rs1, imm) +#define _SLTIU(rd, rs1, imm) Itype(19, rd, 3, rs1, imm) +#define _XORI(rd, rs1, imm) Itype(19, rd, 4, rs1, imm) +#define _ORI(rd, rs1, imm) Itype(19, rd, 6, rs1, imm) +#define _ANDI(rd, rs1, imm) Itype(19, rd, 7, rs1, imm) +#if __WORDSIZE == 32 +# define _SLLI(rd, rs1, imm) Rtype(19, rd, 1, rs1, imm, 0) +# define _SRLI(rd, rs1, imm) Rtype(19, rd, 5, rs1, imm, 0) +# define _SRAI(rd, rs1, imm) Rtype(19, rd, 5, rs1, imm, 32) +#endif +#define _ADD(rd, rs1, rs2) Rtype(51, rd, 0, rs1, rs2, 0) +#define _SUB(rd, rs1, rs2) Rtype(51, rd, 0, rs1, rs2, 32) +#define _SLL(rd, rs1, rs2) Rtype(51, rd, 1, rs1, rs2, 0) +#define _SLT(rd, rs1, rs2) Rtype(51, rd, 2, rs1, rs2, 0) +#define _SLTU(rd, rs1, rs2) Rtype(51, rd, 3, rs1, rs2, 0) +#define _XOR(rd, rs1, rs2) Rtype(51, rd, 4, rs1, rs2, 0) +#define _SRL(rd, rs1, rs2) Rtype(51, rd, 5, rs1, rs2, 0) +#define _SRA(rd, rs1, rs2) Rtype(51, rd, 5, rs1, rs2, 32) +#define _OR(rd, rs1, rs2) Rtype(51, rd, 6, rs1, rs2, 0) +#define _AND(rd, rs1, rs2) Rtype(51, rd, 7, rs1, rs2, 0) +#define _FENCE(imm) Itype( 15, 0, 0, 0, imm) +#define _FENCE_I(imm) Itype( 15, 0, 1, 0, imm) +#define _ECALL() Itype(115, 0, 0, 0, 0) +#define _EBREAK() Itype(115, 0, 0, 0, 1) +#define _CSRRW(rd, rs1, csr) Itype(115, rd, 1, rs1, csr) +#define _CSRRS(rd, rs1, csr) Itype(115, rd, 2, rs1, csr) +#define _CSRRC(rd, rs1, csr) Itype(115, rd, 3, rs1, csr) +#define _CSRRWI(rd, zimm, csr) Itype(115, rd, 5, zimm, csr) +#define _CSRRSI(rd, zimm, csr) Itype(115, rd, 6, zimm, csr) +#define _CSRRCI(rd, zimm, csr) Itype(115, rd, 7, zimm, 
csr) +/* + * RV64I Base Instruction Set (in addition to RV32I) + */ +#define _LWU(rd, rs1, imm) Itype(3, rd, 6, rs1, imm) +#define _LD(rd, rs1, imm) Itype(3, rd, 3, rs1, imm) +#define _SD(rs1, rs2, imm) Stype(35, 3, rs1, rs2, imm) +#if __WORDSIZE == 64 +# define _SLLI(rd, rs1, sh) IStype(19, rd, 1, rs1, sh, 0) +# define _SRLI(rd, rs1, sh) IStype(19, rd, 5, rs1, sh, 0) +# define _SRAI(rd, rs1, sh) IStype(19, rd, 5, rs1, sh, 16) +#endif +#define _ADDIW(rd, rs1, imm) Itype(27, rd, 0, rs1, imm) +#define _SLLIW(rd, rs1, imm) Rtype(27, rd, 1, rs1, imm, 0) +#define _SRLIW(rd, rs1, imm) Rtype(27, rd, 3, rs1, imm, 0) +#define _SRAIW(rd, rs1, imm) Rtype(27, rd, 3, rs1, imm, 32) +#define _ADDW(rd, rs1, imm) Rtype(59, rd, 0, rs1, imm, 0) +#define _SUBW(rd, rs1, imm) Rtype(59, rd, 0, rs1, imm, 32) +#define _SLLW(rd, rs1, imm) Rtype(59, rd, 1, rs1, imm, 0) +#define _SRLW(rd, rs1, imm) Rtype(59, rd, 5, rs1, imm, 0) +#define _SRAW(rd, rs1, imm) Rtype(59, rd, 5, rs1, imm, 32) +/* + * RV32M Standard Extension + */ +#define _MUL(rd, rs1, rs2) Rtype(51, rd, 0, rs1, rs2, 1) +#define _MULH(rd, rs1, rs2) Rtype(51, rd, 1, rs1, rs2, 1) +#define _MULHSU(rd, rs1, rs2) Rtype(51, rd, 2, rs1, rs2, 1) +#define _MULHU(rd, rs1, rs2) Rtype(51, rd, 3, rs1, rs2, 1) +#define _DIV(rd, rs1, rs2) Rtype(51, rd, 4, rs1, rs2, 1) +#define _DIVU(rd, rs1, rs2) Rtype(51, rd, 5, rs1, rs2, 1) +#define _REM(rd, rs1, rs2) Rtype(51, rd, 6, rs1, rs2, 1) +#define _REMU(rd, rs1, rs2) Rtype(51, rd, 7, rs1, rs2, 1) +/* + * RV64M Standard Extension (in addition to RV32M) + */ +#define _MULW(rd, rs1, rs2) Rtype(59, rd, 0, rs1, rs2, 1) +#define _DIVW(rd, rs1, rs2) Rtype(59, rd, 4, rs1, rs2, 1) +#define _DIVUW(rd, rs1, rs2) Rtype(59, rd, 5, rs1, rs2, 1) +#define _REMW(rd, rs1, rs2) Rtype(59, rd, 6, rs1, rs2, 1) +#define _REMUW(rd, rs1, rs2) Rtype(59, rd, 7, rs1, rs2, 1) +/* + * RV32A Standard Extension + */ +#define _LR_W(rd, rs1, rl, aq) R4type(47, rd, 2, rs1, 0, rl, aq, 2) +#define _SC_W(rd, rs1, rs2, rl, aq) R4type(47, 
rd, 2, rs1, rs2, rl, aq, 3) +#define _AMOSWAP_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 1) +#define _AMOADD_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 0) +#define _AMOXOR_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 4) +#define _AMOAND_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 12) +#define _AMOOR_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 8) +#define _AMOMIN_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 16) +#define _AMOMAX_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 20) +#define _AMOMINU_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 24) +#define _AMOMAXU_W(rd, rs1, rs2, rl, aq) R4type(47, rd, 2, rs1, rs2, rl, aq, 28) +/* + * RV64A Standard Extension (in addition to RV32A) + */ +#define _LR_D(rd, rs1, rl, aq) R4type(47, rd, 3, rs1, 0, rl, aq, 2) +#define _SC_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 3) +#define _AMOSWAP_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 1) +#define _AMOADD_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 0) +#define _AMOXOR_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 4) +#define _AMOAND_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 12) +#define _AMOOR_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 8) +#define _AMOMIN_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 16) +#define _AMOMAX_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 20) +#define _AMOMINU_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 24) +#define _AMOMAXU_D(rd, rs1, rs2, rl, aq) R4type(47, rd, 3, rs1, rs2, rl, aq, 28) +/* + * Pseudo Instructions + */ +#define _NOP() _ADDI((jit_gpr_regno(_ZERO)),\ + (jit_gpr_regno(_ZERO)), 0) +#define _MV(r0, r1) _ADDI(r0, r1, 0) +#define _NOT(r0, r1) _XORI(r0, r1, -1) +#define _NEG(r0, r1) _SUB(r0, (jit_gpr_regno(_ZERO)), r1) +#define _NEGW(r0, r1) _SUBW(r0, (jit_gpr_regno(_ZERO)), r1) 
+#define _SEXT_W(r0, r1) _ADDIW(r0, r1, 0) +#define _RET() _JALR((jit_gpr_regno(_ZERO)),\ + (jit_gpr_regno(_RA)), 0) + + + +// Help to make all easier +#define em_wp(jit, inst) emit_u32_with_pool(jit, inst) + +/* + * JIT INSTRUCTIONS + */ + +// Binary ALU operations +static void addr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void addi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void addcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void addci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void addxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void addxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); + +static void subr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void subcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void subxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void subxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); + +static void muli(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void mulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); + +static void divr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void divi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void divr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void divi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); + +static void remi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void remr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void remi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void remr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); + +static void andr(jit_state_t *_jit, int32_t 
r0, int32_t r1, int32_t r2); +static void andi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void orr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void xorr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void xori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void lshi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void lshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void rshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void rshi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0); +static void rshr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void rshi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0); + + +// Four operand ALU operations +static void qmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qmulr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qmuli(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); +static void qmuli_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); + +static void qdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qdivr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3); +static void qdivi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); +static void qdivi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0); + + +// Unary ALU operations +static void negr(jit_state_t *_jit, int32_t r0, int32_t r1); +static void comr(jit_state_t *_jit, int32_t r0, int32_t r1); + + +// Transfer operations +static void movr(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movi(jit_state_t *_jit, int32_t r0, jit_word_t i0); + +static uint64_t patch_load_from_pool(uint64_t instrs, uint32_t off); +static 
jit_reloc_t emit_load_from_pool(jit_state_t *_jit, uint64_t insts); +static jit_reloc_t mov_addr(jit_state_t *_jit, int32_t r0); +static jit_reloc_t movi_from_pool(jit_state_t *_jit, int32_t r0); + +static void extr_c(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_uc(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_s(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_us(jit_state_t *_jit, int32_t r0, int32_t r1); + +# if __WORDSIZE == 64 +static void extr_i(jit_state_t *_jit, int32_t r0, int32_t r1); +static void extr_ui(jit_state_t *_jit, int32_t r0, int32_t r1); +#endif + + +// Branch instructions +static uint32_t patch_cc_jump(uint32_t inst, int32_t offset); +static jit_reloc_t emit_cc_jump(jit_state_t *_jit, uint32_t inst); + +static jit_reloc_t bltr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blti(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bltr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blti_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bler(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blei(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bler_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t blei_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t beqr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t beqi(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bger(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgei(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bger_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgei_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bgtr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgti(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bgtr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static 
jit_reloc_t bgti_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bner(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bnei(jit_state_t *_jit, int32_t r0, jit_word_t i1); + +static jit_reloc_t bmsr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bmsi(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bmcr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bmci(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t boaddr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t boaddi(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t boaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t boaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bxaddr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bxaddi(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bxaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bxaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bosubr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bosubi(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bosubr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bosubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bxsubr(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bxsubi(jit_state_t *_jit, int32_t r0, jit_word_t i1); +static jit_reloc_t bxsubr_u(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bxsubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i1); + + +// Store operations +static void str_c(jit_state_t *_jit, int32_t r0, int32_t r1); +static void str_uc(jit_state_t *_jit, int32_t r0, int32_t r1); +static void str_s(jit_state_t *_jit, int32_t r0, int32_t r1); +static void str_i(jit_state_t *_jit, int32_t r0, int32_t r1); +#if __WORDSIZE == 64 +static void str_i(jit_state_t 
*_jit, int32_t r0, int32_t r1); +#endif + +static void sti_c(jit_state_t *_jit, jit_word_t i0, int32_t r0); +static void sti_s(jit_state_t *_jit, jit_word_t i0, int32_t r0); +static void sti_i(jit_state_t *_jit, jit_word_t i0, int32_t r0); +#if __WORDSIZE == 64 +static void sti_l(jit_state_t *_jit, jit_word_t i0, int32_t r0); +#endif + +static void stxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +# if __WORDSIZE == 64 +static void stxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +#endif + +static void stxi_c(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +static void stxi_s(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +static void stxi_i(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +# if __WORDSIZE == 64 +static void stxi_l(jit_state_t *_jit,jit_word_t i0,int32_t r0,int32_t r1); +# endif + + +// Load operations +static void ldr_c(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_uc(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_s(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_us(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_i(jit_state_t *_jit, int32_t r0, int32_t r1); +# if __WORDSIZE == 64 +static void ldr_ui(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_l(jit_state_t *_jit, int32_t r0, int32_t r1); +# endif + +static void ldi_c(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_uc(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_s(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_us(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_i(jit_state_t *_jit, int32_t r0, jit_word_t i0); +# if __WORDSIZE == 64 +static void ldi_ui(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldi_l(jit_state_t *_jit, int32_t r0, 
jit_word_t i0); +# endif + +static void ldxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_uc(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_us(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +# if __WORDSIZE == 64 +static void ldxr_ui(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +#endif + +static void ldxi_c(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_uc(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_us(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_s(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_i(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +# if __WORDSIZE == 64 +static void ldxi_ui(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldxi_l(jit_state_t *_jit,int32_t r0,int32_t r1,jit_word_t i0); +#endif + + +// Argument management +//static void pushr(jit_state_t *_jit, int32_t r0); +//static void popr(jit_state_t *_jit, int32_t r0); +static void ret(jit_state_t *_jit); +static void retr(jit_state_t *_jit, int32_t r0); +static void reti(jit_state_t *_jit, jit_word_t i0); +static void retval_c(jit_state_t *_jit, int32_t r0); +static void retval_uc(jit_state_t *_jit, int32_t r0); +static void retval_s(jit_state_t *_jit, int32_t r0); +static void retval_us(jit_state_t *_jit, int32_t r0); +static void retval_i(jit_state_t *_jit, int32_t r0); +# if __WORDSIZE == 64 +static void retval_ui(jit_state_t *_jit, int32_t r0); +static void retval_l(jit_state_t *_jit, int32_t r0); +#endif + +// Jump and return +static uint32_t patch_jump(uint32_t inst, int32_t offset); +static jit_reloc_t emit_jump(jit_state_t *_jit, 
uint32_t inst); + +static void callr(jit_state_t *_jit, int32_t r0); +static void calli(jit_state_t *_jit, jit_word_t i0); +static void jmpi_with_link(jit_state_t *_jit, jit_word_t i0); +static void pop_link_register(jit_state_t *_jit); +static void push_link_register(jit_state_t *_jit); +static void jmpr(jit_state_t *_jit, int32_t r0); +static void jmpi(jit_state_t *_jit, jit_word_t i0); +static jit_reloc_t jmp(jit_state_t *_jit); + + +// Atomic operations +static void ldr_atomic(jit_state_t *_jit, int32_t dst, int32_t loc); +static void str_atomic(jit_state_t *_jit, int32_t loc, int32_t val); +static void swap_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, + int32_t val); +static void cas_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, + int32_t expected, int32_t desired); + +// Byte swapping operations +static void bswapr_us(jit_state_t *_jit, int32_t r0, int32_t r1); +static void bswapr_ui(jit_state_t *_jit, int32_t r0, int32_t r1); +# if __WORDSIZE == 64 +static void +bswapr_ul(jit_state_t *_jit, int32_t r0, int32_t r1); +#endif + +// Others +static void nop(jit_state_t *_jit, int32_t im); +static void mfence(jit_state_t *_jit); +static void breakpoint(jit_state_t *_jit); + + + +/* + * Binary ALU operations + */ +static void +addr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _ADD(r0, r1, r2)); +} +static void +addi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _ADDI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + addr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +addcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + // TODO: Not sure if this is correct + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), jit_gpr_regno(t0), r1)); + movr(_jit, r0, jit_gpr_regno(t0)); + 
unget_temp_gpr(_jit); + } + else { + addr(_jit, r0, r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r0, r1)); + } +} + +static void +addci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), jit_gpr_regno(t0), r1)); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addi(_jit, r0, r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r0, r1)); + } +} + +static void +addxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + addcr(_jit, r0, r1, r2); + addcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +addxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + addci(_jit, r0, r1, i0); + addcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +subr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SUB(r0, r1, r2)); +} + +static void +subi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + addi(_jit, r0, r1, -i0); +} + +static void +subcr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + subr(_jit, jit_gpr_regno(t0), r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addr(_jit, r0, r1, r2); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, r0)); + } +} + +static void +subci(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + + jit_gpr_t t0; + if (r0 == r1) { + t0 = get_temp_gpr(_jit); + subi(_jit, jit_gpr_regno(t0), r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, 
jit_gpr_regno(t0))); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + else { + addi(_jit, r0, r1, i0); + em_wp(_jit, _SLTU(jit_gpr_regno(JIT_CARRY), r1, r0)); + } +} + +static void +subxr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + subcr(_jit, r0, r1, r2); + subcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +subxi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0; + t0 = get_temp_gpr(_jit); + movr(_jit, jit_gpr_regno(t0), jit_gpr_regno(JIT_CARRY)); + subci(_jit, r0, r1, i0); + subcr(_jit, r0, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +muli(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + mulr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +mulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _MUL(r0, r1, r2)); +} + +static void +divr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _DIV(r0, r1, r2)); +} + +static void +divi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + divr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +divr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _DIVU(r0, r1, r2)); +} + +static void +divi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + divr_u(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +remi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + remr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void 
+remr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _REM(r0, r1, r2)); +} +static void +remi_u(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + remr_u(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +remr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _REMU(r0, r1, r2)); +} + +static void +andr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _AND(r0, r1, r2)); +} + +static void +andi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _ANDI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + em_wp(_jit, _AND(r0, r1, jit_gpr_regno(t0))); + unget_temp_gpr(_jit); + } +} + +static void +orr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _OR(r0, r1, r2)); +} + +static void +ori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _ORI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + orr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +xorr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _XOR(r0, r1, r2)); +} + +static void +xori(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _XORI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + xorr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +lshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SLL(r0, r1, r2)); +} + +static void +lshi(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _SLLI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, 
jit_gpr_regno(t0), i0); + lshr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +rshr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SRA(r0, r1, r2)); +} + +static void +rshi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _SRAI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + rshr(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + +static void +rshr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _SRL(r0, r1, r2)); +} + +static void +rshi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t i0) +{ + if (simm12_p(i0)){ + em_wp(_jit, _SRLI(r0, r1, i0)); + } else { + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + rshr_u(_jit, r0, r1, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + + +/* + * Four operand ALU operations + */ +static void +iqmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3, + jit_bool_t sign){ + if(r0 == r2 || r0 == r3){ + jit_gpr_t t0 = get_temp_gpr(_jit); + em_wp(_jit, _MUL(jit_gpr_regno(t0), r2, r3)); + if(sign) + em_wp(_jit, _MULH(r1, r2, r3)); + else + em_wp(_jit, _MULHU(r1, r2, r3)); + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + em_wp(_jit, _MUL(r0, r2, r3)); + if(sign) + em_wp(_jit, _MULH(r1, r2, r3)); + else + em_wp(_jit, _MULHU(r1, r2, r3)); +} + +static void +qmulr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqmulr(_jit, r0, r1, r2, r3, 1); +} + +static void +qmulr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqmulr(_jit, r0, r1, r2, r3, 0); +} + +static void +qmuli(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqmulr(_jit, r0, r1, r2, jit_gpr_regno(t0), 1); + unget_temp_gpr(_jit); +} + +static void +qmuli_u(jit_state_t 
*_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqmulr(_jit, r0, r1, r2, jit_gpr_regno(t0), 0); + unget_temp_gpr(_jit); +} + +static void +iqdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3, + jit_bool_t sign){ + if(r0 == r2 || r0 == r3){ + jit_gpr_t t0 = get_temp_gpr(_jit); + if(sign){ + em_wp(_jit, _DIV(jit_gpr_regno(t0), r2, r3)); + em_wp(_jit, _REM(r1, r2, r3)); + } else { + em_wp(_jit, _DIVU(jit_gpr_regno(t0), r2, r3)); + em_wp(_jit, _REMU(r1, r2, r3)); + } + movr(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } + if(sign){ + em_wp(_jit, _DIV(r0, r2, r3)); + em_wp(_jit, _REM(r1, r2, r3)); + } else { + em_wp(_jit, _DIVU(r0, r2, r3)); + em_wp(_jit, _REMU(r1, r2, r3)); + } +} + +static void +qdivr(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqdivr(_jit, r0, r1, r2, r3, 1); +} + +static void +qdivr_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, int32_t r3) +{ + iqdivr(_jit, r0, r1, r2, r3, 0); +} + +static void +qdivi(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqdivr(_jit, r0, r1, r2, jit_gpr_regno(t0), 1); + unget_temp_gpr(_jit); +} + +static void +qdivi_u(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + iqdivr(_jit, r0, r1, r2, jit_gpr_regno(t0), 0); + unget_temp_gpr(_jit); +} + + +/* + * Unary ALU operations + */ +static void +negr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _NEG(r0, r1)); +} + +static void +comr(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _NOT(r0, r1)); +} + + +/* + * Branch instructions + */ +static uint32_t +patch_cc_jump(uint32_t inst, int32_t offset){ + instr_t i; + i.w = inst; + i.B.imm11 = (offset >> 11) & 0x1; + i.B.imm4_1 = (offset >> 1) & 0xf; + 
i.B.imm10_5 = (offset >> 5) & 0x3f;
+  i.B.imm12 = (offset >> 12) & 0x1;
+  return i.w;
+}
+
+/*
+ * Emit the conditional branch `inst` and record a JIT_RELOC_JCC_WITH_VENEER
+ * relocation for it.  Retries until add_pending_literal() accepts the
+ * relocation; the instruction is then emitted with the current offset
+ * patched in and the relocation is returned to the caller.
+ */
+static jit_reloc_t
+emit_cc_jump(jit_state_t *_jit, uint32_t inst)
+{
+  while (1) {
+    uint8_t *pc_base = _jit->pc.uc; // Offset is from current PC
+    int32_t off = (uint8_t*)jit_address(_jit) - pc_base;
+    jit_reloc_t ret =
+      jit_reloc (_jit, JIT_RELOC_JCC_WITH_VENEER, 0, _jit->pc.uc, pc_base, 0);
+    uint8_t cc_jump_width = 12;
+    if (add_pending_literal(_jit, ret, cc_jump_width - 1)) {
+      em_wp(_jit, patch_cc_jump(inst, off));
+      return ret;
+    }
+  }
+}
+
+// Branch if r0 < r1 (BLT).
+static jit_reloc_t
+bltr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return emit_cc_jump(_jit, _BLT(r0, r1, 0));
+}
+
+// Immediate form: loads i0 into a temporary and reuses bltr.
+static jit_reloc_t
+blti(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bltr(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if r0 < r1 (BLTU).
+static jit_reloc_t
+bltr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return emit_cc_jump(_jit, _BLTU(r0, r1, 0));
+}
+
+// Immediate form: loads i0 into a temporary and reuses bltr_u.
+static jit_reloc_t
+blti_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bltr_u(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if r0 <= r1, encoded as BGE with the operands swapped.
+static jit_reloc_t
+bler(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return emit_cc_jump(_jit, _BGE(r1, r0, 0));
+}
+
+// Immediate form: loads i0 into a temporary and reuses bler.
+static jit_reloc_t
+blei(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bler(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if r0 <= r1, encoded as BGEU with the operands swapped.
+static jit_reloc_t
+bler_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return emit_cc_jump(_jit, _BGEU(r1, r0, 0));
+}
+
+// Immediate form: loads i0 into a temporary and reuses bler_u.
+static jit_reloc_t
+blei_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bler_u(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if r0 == r1 (BEQ).
+static jit_reloc_t
+beqr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return emit_cc_jump(_jit, _BEQ(r0, r1, 0));
+}
+
+// Immediate form: loads i0 into a temporary and reuses beqr.
+static jit_reloc_t
+beqi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = beqr(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if r0 >= r1 (BGE).
+static jit_reloc_t
+bger(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return emit_cc_jump(_jit, _BGE(r0, r1, 0));
+}
+
+// Immediate form: loads i0 into a temporary and reuses bger.
+static jit_reloc_t
+bgei(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bger(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if r0 >= r1 (BGEU).
+static jit_reloc_t
+bger_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return emit_cc_jump(_jit, _BGEU(r0, r1, 0));
+}
+
+// Immediate form: loads i0 into a temporary and reuses bger_u.
+static jit_reloc_t
+bgei_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bger_u(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if r0 > r1, implemented as bltr with the operands swapped.
+static jit_reloc_t
+bgtr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return bltr(_jit, r1, r0);
+}
+
+// Immediate form: loads i0 into a temporary and reuses bgtr.
+static jit_reloc_t
+bgti(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bgtr(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if r0 > r1, implemented as bltr_u with the operands swapped.
+static jit_reloc_t
+bgtr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return bltr_u(_jit, r1, r0);
+}
+
+// Immediate form: loads i0 into a temporary and reuses bgtr_u.
+static jit_reloc_t
+bgti_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bgtr_u(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if r0 != r1 (BNE).
+static jit_reloc_t
+bner(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  return emit_cc_jump(_jit, _BNE(r0, r1, 0));
+}
+
+// Immediate form of bner: loads i0 into a temporary first.
+static jit_reloc_t
+bnei(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bner(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if (r0 & r1) != 0.
+static jit_reloc_t
+bmsr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  andr(_jit, jit_gpr_regno(t0), r0, r1);
+  jit_reloc_t ret = bner(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if (r0 & i0) != 0.
+static jit_reloc_t
+bmsi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  andi(_jit, jit_gpr_regno(t0), r0, i0);
+  jit_reloc_t ret = bner(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if (r0 & r1) == 0.
+static jit_reloc_t
+bmcr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  andr(_jit, jit_gpr_regno(t0), r0, r1);
+  jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Branch if (r0 & i0) == 0.
+static jit_reloc_t
+bmci(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  andi(_jit, jit_gpr_regno(t0), r0, i0);
+  jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t0), jit_gpr_regno(_ZERO));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+/*
+ * r0 += r1, branching when the signed addition overflowed.
+ * Overflow happened iff (r1 < 0) differs from (r0 + r1 < r0).
+ */
+static jit_reloc_t
+boaddr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  // NOTE: We need tons of temporaries because RISC-V doesn't provide any
+  // easy way to solve this. We need to do it in software.
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  jit_gpr_t t1 = get_temp_gpr(_jit);
+  jit_gpr_t t2 = get_temp_gpr(_jit);
+
+  addr(_jit, jit_gpr_regno(t0), r0, r1);
+
+  em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0));
+  em_wp(_jit, _SLT(jit_gpr_regno(t2), jit_gpr_regno(t0), r0));
+  movr(_jit, r0, jit_gpr_regno(t0));
+  jit_reloc_t ret = bner(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Immediate form of boaddr.
+static jit_reloc_t
+boaddi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = boaddr(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// r0 += r1, branching when the unsigned sum is below the old r0 (carry out).
+static jit_reloc_t
+boaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  jit_gpr_t t1 = get_temp_gpr(_jit);
+
+  addr(_jit, jit_gpr_regno(t0), r0, r1);
+
+  em_wp(_jit, _SLTU(jit_gpr_regno(t1), jit_gpr_regno(t0), r0));
+  movr(_jit, r0, jit_gpr_regno(t0));
+
+  jit_reloc_t ret = bnei(_jit, jit_gpr_regno(t1), 0);
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Immediate form of boaddr_u.
+static jit_reloc_t
+boaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = boaddr_u(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// r0 += r1, branching when the signed addition did NOT overflow.
+static jit_reloc_t
+bxaddr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  jit_gpr_t t1 = get_temp_gpr(_jit);
+  jit_gpr_t t2 = get_temp_gpr(_jit);
+
+  addr(_jit, jit_gpr_regno(t0), r0, r1);
+
+  em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0));
+  em_wp(_jit, _SLT(jit_gpr_regno(t2), jit_gpr_regno(t0), r0));
+  movr(_jit, r0, jit_gpr_regno(t0));
+  jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Immediate form of bxaddr.
+static jit_reloc_t
+bxaddi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bxaddr(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// r0 += r1, branching when the unsigned addition produced no carry.
+static jit_reloc_t
+bxaddr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  jit_gpr_t t1 = get_temp_gpr(_jit);
+
+  addr(_jit, jit_gpr_regno(t0), r0, r1);
+
+  em_wp(_jit, _SLTU(jit_gpr_regno(t1), jit_gpr_regno(t0), r0));
+  movr(_jit, r0, jit_gpr_regno(t0));
+
+  jit_reloc_t ret = beqi(_jit, jit_gpr_regno(t1), 0);
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Immediate form of bxaddr_u.
+static jit_reloc_t
+bxaddi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bxaddr_u(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+/*
+ * r0 -= r1, branching when the signed subtraction overflowed.
+ * Overflow happened iff (r1 < 0) differs from (r0 < r0 - r1).
+ */
+static jit_reloc_t
+bosubr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  jit_gpr_t t1 = get_temp_gpr(_jit);
+  jit_gpr_t t2 = get_temp_gpr(_jit);
+
+  subr(_jit, jit_gpr_regno(t0), r0, r1);
+
+  em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0));
+  em_wp(_jit, _SLT(jit_gpr_regno(t2), r0, jit_gpr_regno(t0)));
+  movr(_jit, r0, jit_gpr_regno(t0));
+  jit_reloc_t ret = bner(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Immediate form of bosubr.
+static jit_reloc_t
+bosubi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bosubr(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// r0 -= r1, branching when the old r0 is below the unsigned result (borrow).
+static jit_reloc_t
+bosubr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  jit_gpr_t t1 = get_temp_gpr(_jit);
+
+  subr(_jit, jit_gpr_regno(t0), r0, r1);
+
+  em_wp(_jit, _SLTU(jit_gpr_regno(t1), r0, jit_gpr_regno(t0)));
+  movr(_jit, r0, jit_gpr_regno(t0));
+  jit_reloc_t ret = beqi(_jit, jit_gpr_regno(t1), 1);
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Immediate form of bosubr_u.
+static jit_reloc_t
+bosubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bosubr_u(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// r0 -= r1, branching when the signed subtraction did NOT overflow.
+static jit_reloc_t
+bxsubr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  jit_gpr_t t1 = get_temp_gpr(_jit);
+  jit_gpr_t t2 = get_temp_gpr(_jit);
+
+  subr(_jit, jit_gpr_regno(t0), r0, r1);
+
+  em_wp(_jit, _SLTI(jit_gpr_regno(t1), r1, 0));
+  em_wp(_jit, _SLT(jit_gpr_regno(t2), r0, jit_gpr_regno(t0)));
+  movr(_jit, r0, jit_gpr_regno(t0));
+  jit_reloc_t ret = beqr(_jit, jit_gpr_regno(t1), jit_gpr_regno(t2));
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Immediate form of bxsubr.
+static jit_reloc_t
+bxsubi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bxsubr(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// r0 -= r1, branching when the unsigned subtraction produced no borrow.
+static jit_reloc_t
+bxsubr_u(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  jit_gpr_t t1 = get_temp_gpr(_jit);
+
+  subr(_jit, jit_gpr_regno(t0), r0, r1);
+
+  em_wp(_jit, _SLTU(jit_gpr_regno(t1), r0, jit_gpr_regno(t0)));
+  movr(_jit, r0, jit_gpr_regno(t0));
+  jit_reloc_t ret = beqi(_jit, jit_gpr_regno(t1), 0);
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+// Immediate form of bxsubr_u.
+static jit_reloc_t
+bxsubi_u(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  jit_reloc_t ret = bxsubr_u(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+  return ret;
+}
+
+
+/*
+ * Transfer operations
+ */
+// Register move; emits nothing when source and destination coincide.
+static void
+movr(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  if (r0 != r1)
+    em_wp(_jit, _MV(r0, r1));
+}
+
+/*
+ * Load the constant i0 into r0.
+ *
+ * Short form (LUI + ADDI) is used for values that fit in 32 bits.  On RV64
+ * LUI sign-extends its 32-bit result, so for 0x7FFFF800..0x7FFFFFFF the
+ * rounded-up upper part (0x80000) would become negative and the 64-bit ADDI
+ * could not bring the value back; those constants take the long form.
+ *
+ * Long form builds the value 11 bits at a time with shifts and ADDIs; every
+ * chunk is non-negative and below 2^11, so it always fits ADDI's immediate.
+ */
+static void
+movi(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  int32_t srcreg = jit_gpr_regno(_ZERO);
+  int short_form = simm32_p(i0);
+#if __WORDSIZE == 64
+  if (i0 > 0x7FFFF7FF)
+    short_form = 0;
+#endif
+  if (short_form){
+
+    int64_t hi = ((i0 + 0x800) >> 12) & 0xFFFFF;
+    // Sign-extend the low 12 bits without shifting a signed value left.
+    int64_t lo = ((i0 & 0xFFF) ^ 0x800) - 0x800;
+
+    if(hi){
+      em_wp(_jit, _LUI(r0, hi));
+      srcreg = r0;
+    }
+
+    if(lo || hi == 0){
+      em_wp(_jit, _ADDI(r0, srcreg, lo));
+    }
+
+  } else {
+    // 64 bits: load in various steps
+    // lui, addi, slli, addi, slli, addi, slli, addi
+    int64_t hh = (i0>>44);
+    int64_t hl = (i0>>33) - (hh<<11);
+    int64_t lh = (i0>>22) - ((hh<<22) + (hl<<11));
+    int64_t lm = (i0>>11) - ((hh<<33) + (hl<<22) + (lh<<11));
+    int64_t ll = i0 - ((hh<<44) + (hl<<33) + (lh<<22) + (lm<<11));
+
+
+    em_wp(_jit, _LUI(r0, hh));
+    // Move the 20 LUI bits down to bits 11..30 and clear the sign extension.
+    em_wp(_jit, _SLLI(r0, r0, 32));
+    em_wp(_jit, _SRLI(r0, r0, 33));
+    em_wp(_jit, _ADDI(r0, r0, hl));
+
+    em_wp(_jit, _SLLI(r0, r0, 11));
+    em_wp(_jit, _ADDI(r0, r0, lh));
+
+    em_wp(_jit, _SLLI(r0, r0, 11));
+    em_wp(_jit, _ADDI(r0, r0, lm));
+
+    em_wp(_jit, _SLLI(r0, r0, 11));
+    em_wp(_jit, _ADDI(r0, r0, ll));
+  }
+}
+
+// The two-instruction sequence used to load a word from the literal pool,
+// viewable either as two instructions or as one 64-bit value.
+typedef union{
+  struct{
+    instr_t auipc;
+    instr_t load; // `ld` in RV64 and `lw` in RV32
+  } inst;
+  uint64_t l;
+} load_from_pool_t;
+
+/*
+ * Rewrite an AUIPC + load pair so that it addresses pc + off.
+ *
+ * The load's 12-bit immediate is sign-extended by the hardware, so the upper
+ * part is rounded (+0x800) and the low part is taken as the signed remainder
+ * in [-2048, 2047]; truncating instead would address 4096 bytes too low
+ * whenever bit 11 of off is set.
+ */
+static uint64_t
+patch_load_from_pool(uint64_t instrs, int32_t off){
+
+  load_from_pool_t out, in;
+  int32_t hi20 = (off + 0x800) >> 12;
+  int32_t lo12 = off - hi20 * 4096;
+  in.l = instrs;
+  out.inst.auipc.w = _AUIPC(in.inst.auipc.U.rd, hi20);
+  out.inst.load.w  = Itype(in.inst.load.I.opcode, // `ld` or `lw`
+                           in.inst.load.I.rd,
+                           in.inst.load.I.funct3,
+                           in.inst.load.I.rs1,
+                           lo12);
+  return out.l;
+}
+
+// Emit the AUIPC + load pair and record a JIT_RELOC_LOAD_FROM_POOL relocation
+// for it, retrying until add_pending_literal() accepts the relocation.
+static jit_reloc_t
+emit_load_from_pool(jit_state_t *_jit, uint64_t insts)
+{
+  while (1) {
+    uint8_t *pc_base = _jit->pc.uc; // Offset is from current PC
+    int32_t off = (_jit->pc.uc - pc_base);
+    jit_reloc_t ret =
+      jit_reloc (_jit, JIT_RELOC_LOAD_FROM_POOL, 0, _jit->pc.uc, pc_base, 0);
+    uint8_t load_from_pool_width = 32;
+    if (add_pending_literal(_jit, ret, load_from_pool_width)) {
+      emit_u64(_jit, patch_load_from_pool(insts, off));
+      return ret;
+    }
+  }
+}
+// Load a word-sized literal into r0; the returned relocation is patched later.
+static jit_reloc_t
+movi_from_pool(jit_state_t *_jit, int32_t r0)
+{
+  load_from_pool_t insts;
+  insts.inst.auipc.w = _AUIPC(r0, 0);
+#if __WORDSIZE == 64
+  insts.inst.load.w = _LD(r0, r0, 0);
+#elif __WORDSIZE == 32
+  insts.inst.load.w = _LW(r0, r0, 0);
+#endif
+  return emit_load_from_pool(_jit, insts.l);
+}
+static jit_reloc_t
+mov_addr(jit_state_t *_jit, int32_t r0)
+{
+  return movi_from_pool(_jit, r0);
+}
+
+
+// Extensions are done by shifting the field to the top of the register and
+// back down: arithmetic shift for signed, logical shift for unsigned.
+static void
+extr_c(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int rot = __WORDSIZE - 8;
+  lshi(_jit, r0, r1, rot);
+  rshi(_jit, r0, r0, rot);
+}
+
+static void
+extr_uc(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int rot = __WORDSIZE - 8;
+  lshi(_jit, r0, r1, rot);
+  rshi_u(_jit, r0, r0, rot);
+}
+
+static void
+extr_s(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int rot = __WORDSIZE - 16;
+  lshi(_jit, r0, r1, rot);
+  rshi(_jit, r0, r0, rot);
+}
+
+static void
+extr_us(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int rot = __WORDSIZE - 16;
+  lshi(_jit, r0, r1, rot);
+  rshi_u(_jit, r0, r0, rot);
+}
+
+# if __WORDSIZE == 64
+static void
+extr_i(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int rot = __WORDSIZE - 32;
+  lshi(_jit, r0, r1, rot);
+  rshi(_jit, r0, r0, rot);
+}
+static void
+extr_ui(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  int rot = __WORDSIZE - 32;
+  lshi(_jit, r0, r1, rot);
+  rshi_u(_jit, r0, r0, rot);
+}
+#endif
+
+/*
+ * Store operations
+ */
+// str_*(r0, r1): r0 and r1 are handed to the store macro in this order with
+// a zero offset.
+static void
+str_c(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _SB(r0, r1, 0));
+}
+static void
+str_uc(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _SB(r0, r1, 0));
+}
+static void
+str_s(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _SH(r0, r1, 0));
+}
+static void
+str_i(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _SW(r0, r1, 0));
+}
+#if __WORDSIZE == 64
+static void
+str_l(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _SD(r0, r1, 0));
+}
+#endif
+
+// sti_*(i0, r0): same as str_* with the constant i0 loaded into a temporary
+// and used in place of the first register.
+static void
+sti_c(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  str_c(_jit, jit_gpr_regno(t0), r0);
+  unget_temp_gpr(_jit);
+}
+
+static void
+sti_s(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  str_s(_jit, jit_gpr_regno(t0), r0);
+  unget_temp_gpr(_jit);
+}
+
+static void
+sti_i(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  str_i(_jit, jit_gpr_regno(t0), r0);
+  unget_temp_gpr(_jit);
+}
+
+#if __WORDSIZE == 64
+static void
+sti_l(jit_state_t *_jit, jit_word_t i0, int32_t r0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  str_l(_jit, jit_gpr_regno(t0), r0);
+  unget_temp_gpr(_jit);
+}
+#endif
+
+// stxr_*(r0, r1, r2): same as str_* with r0 + r1 computed into a temporary
+// and used in place of the first register.
+static void
+stxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r0, r1);
+  str_c(_jit, jit_gpr_regno(t0), r2);
+  unget_temp_gpr(_jit);
+}
+
+static void
+stxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r0, r1);
+  str_s(_jit, jit_gpr_regno(t0), r2);
+  unget_temp_gpr(_jit);
+}
+
+static void
+stxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r0, r1);
+  str_i(_jit, jit_gpr_regno(t0), r2);
+  unget_temp_gpr(_jit);
+}
+
+# if __WORDSIZE == 64
+static void
+stxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r0, r1);
+  str_l(_jit, jit_gpr_regno(t0), r2);
+  unget_temp_gpr(_jit);
+}
+#endif
+
+// stxi_*(i0, r0, r1): uses the store's own offset field when i0 fits in 12
+// signed bits; otherwise r0 + i0 is computed into a temporary first.
+static void
+stxi_c(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _SB(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r0, i0);
+    str_c(_jit, jit_gpr_regno(t0), r1);
+    unget_temp_gpr(_jit);
+  }
+}
+
+
+static void
+stxi_s(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _SH(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r0, i0);
+    str_s(_jit, jit_gpr_regno(t0), r1);
+    unget_temp_gpr(_jit);
+  }
+}
+
+
+static void
+stxi_i(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _SW(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r0, i0);
+    str_i(_jit, jit_gpr_regno(t0), r1);
+    unget_temp_gpr(_jit);
+  }
+}
+
+# if __WORDSIZE == 64
+static void
+stxi_l(jit_state_t *_jit,jit_word_t i0,int32_t r0,int32_t r1)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _SD(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r0, i0);
+    str_l(_jit, jit_gpr_regno(t0), r1);
+    unget_temp_gpr(_jit);
+  }
+}
+# endif
+
+
+/*
+ * Load operations
+ */
+// ldr_*(r0, r1): r0 and r1 are handed to the load macro in this order with
+// a zero offset; the suffix selects the load instruction.
+static void
+ldr_c(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _LB(r0, r1, 0));
+}
+
+static void
+ldr_uc(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _LBU(r0, r1, 0));
+}
+
+static void
+ldr_s(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _LH(r0, r1, 0));
+}
+
+static void
+ldr_us(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _LHU(r0, r1, 0));
+}
+
+static void
+ldr_i(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _LW(r0, r1, 0));
+}
+
+# if __WORDSIZE == 64
+static void
+ldr_ui(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _LWU(r0, r1, 0));
+}
+
+static void
+ldr_l(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  em_wp(_jit, _LD(r0, r1, 0));
+}
+# endif
+
+
+// ldi_*(r0, i0): same as ldr_* with the constant i0 loaded into a temporary
+// and used in place of the second register.
+static void
+ldi_c(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  ldr_c(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+
+static void
+ldi_uc(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  ldr_uc(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+
+static void
+ldi_s(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  ldr_s(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+
+static void
+ldi_us(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  ldr_us(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+
+
+static void
+ldi_i(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  ldr_i(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+
+# if __WORDSIZE == 64
+static void
+ldi_ui(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  ldr_ui(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+
+static void
+ldi_l(jit_state_t *_jit, int32_t r0, jit_word_t i0)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  movi(_jit, jit_gpr_regno(t0), i0);
+  ldr_l(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+#endif
+
+
+
+
+// ldxr_*(r0, r1, r2): same as ldr_* with r1 + r2 computed into a temporary
+// and used in place of the second register.
+static void
+ldxr_c(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r1, r2);
+  ldr_c(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+static void
+ldxr_uc(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r1, r2);
+  ldr_uc(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+static void
+ldxr_s(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r1, r2);
+  ldr_s(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+static void
+ldxr_us(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r1, r2);
+  ldr_us(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+static void
+ldxr_i(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r1, r2);
+  ldr_i(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+# if __WORDSIZE == 64
+static void
+ldxr_ui(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r1, r2);
+  ldr_ui(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+static void
+ldxr_l(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2)
+{
+  jit_gpr_t t0 = get_temp_gpr(_jit);
+  addr(_jit, jit_gpr_regno(t0), r1, r2);
+  ldr_l(_jit, r0, jit_gpr_regno(t0));
+  unget_temp_gpr(_jit);
+}
+#endif
+
+
+
+
+// ldxi_*(r0, r1, i0): uses the load's own offset field when i0 fits in 12
+// signed bits; otherwise r1 + i0 is computed into a temporary first.  The
+// short path must use the same load instruction as the matching ldr_*
+// (it previously always emitted LD, loading 64 bits regardless of the type).
+static void
+ldxi_c(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LB(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_c(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+static void
+ldxi_uc(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LBU(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_uc(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+static void
+ldxi_us(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LHU(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_us(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+static void
+ldxi_s(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LH(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_s(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+static void
+ldxi_i(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LW(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_i(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+# if __WORDSIZE == 64
+static void
+ldxi_ui(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LWU(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_ui(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+static void
+ldxi_l(jit_state_t *_jit,int32_t r0,int32_t r1,jit_word_t i0)
+{
+  if (simm12_p(i0))
+    em_wp(_jit, _LD(r0, r1, i0));
+  else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    addi(_jit, jit_gpr_regno(t0), r1, i0);
+    ldr_l(_jit, r0, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+#endif
+
+
+/*
+ * Argument management
+ */
+
+// static void
+// pushr(jit_state_t *_jit, int32_t r0)
+// {
+// #if __WORDSIZE == 64
+//   addi(jit_gpr_regno(_SP), -8);
+//   em_wp(_SD(r0, jit_gpr_regno(_SP), 0));
+// #elif __WORDSIZE == 32
+//   addi(jit_gpr_regno(_SP), -4);
+//   em_wp(_SW(r0, jit_gpr_regno(_SP), 0));
+// #endif
+// }
+// static void
+// popr(jit_state_t *_jit, int32_t r0)
+// {
+// #if __WORDSIZE == 64
+//   em_wp(_jit, _LD(r0, jit_gpr_regno(_SP), 0));
+//   addi(jit_gpr_regno(_SP), 8);
+// #elif __WORDSIZE == 32
+//   em_wp(_jit, _LW(r0, jit_gpr_regno(_SP), 0));
+//   addi(jit_gpr_regno(_SP), 4);
+// #endif
+// }
+
+static void
+ret(jit_state_t *_jit)
+{
+  em_wp(_jit, _RET());
+}
+
+// Return with r0 copied into A0.
+static void
+retr(jit_state_t *_jit, int32_t r0)
+{
+  movr(_jit, jit_gpr_regno(_A0), r0);
+  ret(_jit);
+}
+
+// Return with the constant i0 loaded into A0.
+static void
+reti(jit_state_t *_jit, jit_word_t i0)
+{
+  movi(_jit, jit_gpr_regno(_A0), i0);
+  ret(_jit);
+}
+
+// retval_*: copy A0 into r0, extended according to the suffix.
+static void
+retval_c(jit_state_t *_jit, int32_t r0)
+{
+  extr_c(_jit, r0, jit_gpr_regno(_A0));
+}
+
+static void
+retval_uc(jit_state_t *_jit, int32_t r0)
+{
+  extr_uc(_jit, r0, jit_gpr_regno(_A0));
+}
+
+static void
+retval_s(jit_state_t *_jit, int32_t r0)
+{
+  extr_s(_jit, r0, jit_gpr_regno(_A0));
+}
+
+static void
+retval_us(jit_state_t *_jit, int32_t r0)
+{
+  extr_us(_jit, r0, jit_gpr_regno(_A0));
+}
+
+static void
+retval_i(jit_state_t *_jit, int32_t r0)
+{
+#if __WORDSIZE == 64
+  extr_i(_jit, r0, jit_gpr_regno(_A0));
+#else
+  // extr_i only exists for 64-bit words; a 32-bit int already fills the
+  // register, so a plain move is enough.
+  movr(_jit, r0, jit_gpr_regno(_A0));
+#endif
+}
+
+# if __WORDSIZE == 64
+static void
+retval_ui(jit_state_t *_jit, int32_t r0)
+{
+  extr_ui(_jit, r0, jit_gpr_regno(_A0));
+}
+
+static void
+retval_l(jit_state_t *_jit, int32_t r0)
+{
+  movr(_jit, r0, jit_gpr_regno(_A0));
+}
+#endif
+
+/*
+ * Jump and return instructions
+ */
+// Scatter `offset` into the J-type immediate fields of `inst`.
+static uint32_t
+patch_jump(uint32_t inst, int32_t offset)
+{
+  instr_t i;
+  i.w = inst;
+  i.J.imm20    =  (offset >> 20) & 0x1;
+  i.J.imm19_12 =  (offset >> 12) & 0xff;
+  i.J.imm11    =  (offset >> 11) & 0x1;
+  i.J.imm10_1  =  (offset >> 1)  & 0x3ff;
+  return i.w;
+}
+// Emit the jump `inst` and record a JIT_RELOC_JMP_WITH_VENEER relocation for
+// it, retrying until add_pending_literal() accepts the relocation.
+static jit_reloc_t
+emit_jump(jit_state_t *_jit, uint32_t inst)
+{
+  while (1) {
+    uint8_t *pc_base = _jit->pc.uc; // Offset is from current PC
+    int32_t off = (uint8_t*)jit_address(_jit) - pc_base;
+    jit_reloc_t ret =
+      jit_reloc (_jit, JIT_RELOC_JMP_WITH_VENEER, 0, _jit->pc.uc, pc_base, 0);
+    uint8_t jump_width = 20;
+    if (add_pending_literal(_jit, ret, jump_width - 1)) {
+      em_wp(_jit, patch_jump(inst, off));
+      return ret;
+    }
+  }
+}
+
+// Call through r0, linking in RA.
+static void
+callr(jit_state_t *_jit, int32_t r0)
+{
+  em_wp(_jit, _JALR(jit_gpr_regno(_RA), r0, 0));
+}
+
+// Call the absolute address i0: a pc-relative JAL when the displacement
+// passes simm20_p, otherwise the address is loaded into a temporary.
+static void
+calli(jit_state_t *_jit, jit_word_t i0)
+{
+  jit_word_t jumpoffset = i0 - (jit_word_t)(_jit->pc.uc);
+  if (simm20_p(jumpoffset)){
+    em_wp(_jit, _JAL(jit_gpr_regno(_RA), jumpoffset));
+  } else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    movi(_jit, jit_gpr_regno(t0), i0);
+    callr(_jit, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+
+static void
+jmpi_with_link(jit_state_t *_jit, jit_word_t i0)
+{
+  calli(_jit, i0);
+}
+
+// Intentionally empty: nothing is emitted for the link register here.
+static void
+pop_link_register(jit_state_t *_jit)
+{
+}
+
+// Intentionally empty: nothing is emitted for the link register here.
+static void
+push_link_register(jit_state_t *_jit)
+{
+}
+
+// Jump through r0 without linking (rd = ZERO).
+static void
+jmpr(jit_state_t *_jit, int32_t r0)
+{
+  em_wp(_jit, _JALR(jit_gpr_regno(_ZERO), r0, 0));
+}
+
+// Jump to the absolute address i0: a pc-relative JAL when the displacement
+// passes simm20_p, otherwise the address is loaded into a temporary.
+static void
+jmpi(jit_state_t *_jit, jit_word_t i0)
+{
+  jit_word_t jumpoffset = i0 - (jit_word_t)(_jit->pc.uc);
+  if (simm20_p(jumpoffset)){
+    em_wp(_jit, _JAL(jit_gpr_regno(_ZERO), jumpoffset));
+  } else {
+    jit_gpr_t t0 = get_temp_gpr(_jit);
+    movi(_jit, jit_gpr_regno(t0), i0);
+    jmpr(_jit, jit_gpr_regno(t0));
+    unget_temp_gpr(_jit);
+  }
+}
+
+// Unconditional jump whose target is filled in later through the relocation.
+static jit_reloc_t
+jmp(jit_state_t *_jit)
+{
+  return emit_jump(_jit, _JAL(jit_gpr_regno(_ZERO), 0));
+}
+
+
+
+/*
+ * Atomic operations
+ */
+
+// Fenced word-sized load.  Uses the 64-bit load on RV64 so the access width
+// matches swap_atomic and cas_atomic.
+static void
+ldr_atomic(jit_state_t *_jit, int32_t dst, int32_t loc)
+{
+  em_wp(_jit, _FENCE(0xFF));
+#if __WORDSIZE == 64
+  ldr_l(_jit, dst, loc);
+#else
+  ldr_i(_jit, dst, loc);
+#endif
+  em_wp(_jit, _FENCE(0xFF));
+}
+
+// Fenced word-sized store.  Uses the 64-bit store on RV64 so the access width
+// matches swap_atomic and cas_atomic.
+static void
+str_atomic(jit_state_t *_jit, int32_t loc, int32_t val)
+{
+  em_wp(_jit, _FENCE(0xFF));
+#if __WORDSIZE == 64
+  str_l(_jit, loc, val);
+#else
+  str_i(_jit, loc, val);
+#endif
+  em_wp(_jit, _FENCE(0xFF));
+}
+
+// Word-sized atomic swap (AMOSWAP with both trailing flag arguments set).
+static void
+swap_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, int32_t val)
+{
+#if __WORDSIZE == 64
+  em_wp(_jit, _AMOSWAP_D(dst, loc, val, 1, 1));
+#elif __WORDSIZE == 32
+  em_wp(_jit, _AMOSWAP_W(dst, loc, val, 1, 1));
+#endif
+}
+
+/*
+ * Compare-and-swap built from an LR/SC loop: load-reserve the word at loc,
+ * leave the loop if it differs from `expected`, otherwise try to
+ * store-conditional `desired` and retry while the SC status is non-zero.
+ * dst receives the value that was loaded.
+ */
+static void
+cas_atomic(jit_state_t *_jit, int32_t dst, int32_t loc, int32_t expected,
+           int32_t desired)
+{
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit));
+
+  void *retry = jit_address(_jit);
+
+#if __WORDSIZE == 64
+  em_wp(_jit, _LR_D(t0, loc, 0,0));
+#elif __WORDSIZE == 32
+  em_wp(_jit, _LR_W(t0, loc, 0,0));
+#endif
+
+  jit_reloc_t fail = bner(_jit, t0, expected);
+
+#if __WORDSIZE == 64
+  em_wp(_jit, _SC_D(t1, desired, loc, 0,0));
+#elif __WORDSIZE == 32
+  em_wp(_jit, _SC_W(t1, desired, loc, 0,0));
+#endif
+
+  jit_patch_there(_jit, bner(_jit, t1, jit_gpr_regno(_ZERO)), retry);
+
+  jit_patch_here(_jit, fail);
+  em_wp(_jit, _FENCE(0xFF));
+  movr(_jit, dst, t0);
+
+  unget_temp_gpr(_jit);
+  unget_temp_gpr(_jit);
+}
+
+
+/*
+ * Byte swapping operations
+ * RISC-V doesn't provide them by default.
+ * There's a B extension (Standard Extension for Bit Manipulation) draft, but
+ * it's not official yet:
+ *     https://github.com/riscv/riscv-bitmanip
+ * Meanwhile, we need to implement them in software.
+ */
+/*
+ * Reverse the low `size` bytes of r1 into r0, one byte per iteration.
+ * When r0 and r1 are the same register the result is accumulated in a
+ * temporary, because writing r0 early would destroy the bytes still to be
+ * read from r1.
+ */
+static void
+bswapr_uany(jit_state_t *_jit, int32_t r0, int32_t r1, size_t size)
+{
+  const int aliased = (r0 == r1);
+  int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit));
+  int32_t acc = aliased ? jit_gpr_regno(get_temp_gpr(_jit)) : r0;
+
+  andi(_jit, acc, r1, 0xFF);
+  for (size_t i = 1; i < size; i++) {
+    lshi(_jit, acc, acc, 8);
+    rshi(_jit, t0, r1, 8*i);
+    andi(_jit, t0, t0, 0xFF);
+    orr(_jit, acc, acc, t0);
+  }
+  if (aliased) {
+    movr(_jit, r0, acc);
+    unget_temp_gpr(_jit);
+  }
+  unget_temp_gpr(_jit);
+}
+
+static void
+bswapr_us(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  bswapr_uany(_jit, r0, r1, 2);
+}
+
+static void
+bswapr_ui(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  bswapr_uany(_jit, r0, r1, 4);
+}
+
+# if __WORDSIZE == 64
+static void
+bswapr_ul(jit_state_t *_jit, int32_t r0, int32_t r1)
+{
+  bswapr_uany(_jit, r0, r1, 8);
+}
+#endif
+
+
+
+/*
+ * Others
+ * TODO
+ */
+// Emit im bytes of padding as 4-byte NOPs; im must be a multiple of 4.
+static void
+nop(jit_state_t *_jit, int32_t im)
+{
+  for (; im > 0; im -= 4)
+    em_wp(_jit, _NOP());
+  assert(im == 0);
+}
+static void
+mfence(jit_state_t *_jit)
+{
+  // TODO: we may need it for atomic operations?
+}
+
+static void
+breakpoint(jit_state_t *_jit)
+{
+  em_wp(_jit, _EBREAK());
+}
diff --git a/lightening/riscv-fpu.c b/lightening/riscv-fpu.c
new file mode 100644
index 000000000..315ed8d14
--- /dev/null
+++ b/lightening/riscv-fpu.c
@@ -0,0 +1,858 @@
+/*
+ * RV32F Standard Extension
+ */
+#define _FLW(rd, rs1, im)             Itype(7, rd, 2, rs1, im)
+#define _FSW(rs1, rs2, imm)           Stype(39, 2, rs1, rs2, imm)
+#define _FMADD_S(rd, rs1, rs2, rs3)   R4type(67, rd, 0, rs1, rs2, 0, rs3)
+#define _FMSUB_S(rd, rs1, rs2, rs3)   R4type(71, rd, 0, rs1, rs2, 0, rs3)
+#define _FNMSUB_S(rd, rs1, rs2, rs3)  R4type(75, rd, 0, rs1, rs2, 0, rs3)
+#define _FNMADD_S(rd, rs1, rs2, rs3)  R4type(79, rd, 0, rs1, rs2, 0, rs3)
+#define _FADD_S(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 0)
+#define _FSUB_S(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 4)
+#define _FMUL_S(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 8)
+#define _FDIV_S(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 12)
+#define _FSQRT_S(rd, rs1)             Rtype(83, rd, 0, rs1, 0, 44)
+#define _FSGNJ_S(rd, rs1, rs2)        Rtype(83, rd, 0, rs1, rs2, 16)
+#define _FSGNJN_S(rd, rs1, rs2)       Rtype(83, rd, 1, rs1, rs2, 16)
+#define _FSGNJX_S(rd, rs1, rs2)       Rtype(83, rd, 2, rs1, rs2, 16)
+#define _FMIN_S(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 20)
+#define _FMAX_S(rd, rs1, rs2)         Rtype(83, rd, 1, rs1, rs2, 20)
+#define _FCVT_W_S(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 0, 96)
+#define _FCVT_WU_S(rd, rs1, rm)       Rtype(83, rd, rm, rs1, 1, 96)
+#define _FMV_X_W(rd, rs1)             Rtype(83, rd, 0, rs1, 0, 112)
+#define _FEQ_S(rd, rs1, rs2)          Rtype(83, rd, 2, rs1, rs2, 80)
+#define _FLT_S(rd, rs1, rs2)          Rtype(83, rd, 1, rs1, rs2, 80)
+#define _FLE_S(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 80)
+#define _FCLASS_S(rd, rs1)            Rtype(83, rd, 1, rs1, 0, 112)
+#define _FCVT_S_W(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 0, 104)
+#define _FCVT_S_WU(rd, rs1, rm)       Rtype(83, rd, rm, rs1, 1, 104)
+#define _FMV_W_X(rd, rs1)             Rtype(83, rd, 0, rs1, 0, 120)
+/*
+ * RV64F Standard Extension (in addition to RV32F)
+ */
+#define _FCVT_L_S(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 2, 96)
+#define _FCVT_LU_S(rd, rs1, rm)       Rtype(83, rd, rm, rs1, 3, 96)
+#define _FCVT_S_L(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 2, 104)
+#define _FCVT_S_LU(rd, rs1, rm)       Rtype(83, rd, rm, rs1, 3, 104)
+/*
+ * RV32D Standard Extension
+ */
+#define _FLD(rd, rs1, im)             Itype(7, rd, 3, rs1, im)
+#define _FSD(rs1, rs2, imm)           Stype(39, 3, rs1, rs2, imm)
+#define _FMADD_D(rd, rs1, rs2, rs3)   R4type(67, rd, 0, rs1, rs2, 1, rs3)
+#define _FMSUB_D(rd, rs1, rs2, rs3)   R4type(71, rd, 0, rs1, rs2, 1, rs3)
+#define _FNMSUB_D(rd, rs1, rs2, rs3)  R4type(75, rd, 0, rs1, rs2, 1, rs3)
+#define _FNMADD_D(rd, rs1, rs2, rs3)  R4type(79, rd, 0, rs1, rs2, 1, rs3)
+#define _FADD_D(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 1)
+#define _FSUB_D(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 5)
+#define _FMUL_D(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 9)
+#define _FDIV_D(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 13)
+#define _FSQRT_D(rd, rs1)             Rtype(83, rd, 0, rs1, 0, 45)
+#define _FSGNJ_D(rd, rs1, rs2)        Rtype(83, rd, 0, rs1, rs2, 17)
+#define _FSGNJN_D(rd, rs1, rs2)       Rtype(83, rd, 1, rs1, rs2, 17)
+#define _FSGNJX_D(rd, rs1, rs2)       Rtype(83, rd, 2, rs1, rs2, 17)
+#define _FMIN_D(rd, rs1, rs2)         Rtype(83, rd, 0, rs1, rs2, 21)
+#define _FMAX_D(rd, rs1, rs2)         Rtype(83, rd, 1, rs1, rs2, 21)
+#define _FCVT_S_D(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 1, 32)
+#define _FCVT_D_S(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 0, 33)
+#define _FEQ_D(rd, rs1, rs2)          Rtype(83, rd, 2, rs1, rs2, 81)
+#define _FLT_D(rd, rs1, rs2)          Rtype(83, rd, 1, rs1, rs2, 81)
+#define _FLE_D(rd, rs1, rs2)          Rtype(83, rd, 0, rs1, rs2, 81)
+#define _FCLASS_D(rd, rs1)            Rtype(83, rd, 1, rs1, 0, 113)
+#define _FCVT_W_D(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 0, 97)
+#define _FCVT_WU_D(rd, rs1, rm)       Rtype(83, rd, rm, rs1, 1, 97)
+#define _FCVT_D_W(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 0, 105)
+#define _FCVT_D_WU(rd, rs1, rm)       Rtype(83, rd, rm, rs1, 1, 105)
+/*
+ * RV64D Standard Extension (in addition to RV32D)
+ */
+#define _FCVT_L_D(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 2, 97)
+#define _FCVT_LU_D(rd, rs1, rm)       Rtype(83, rd, rm, rs1, 3, 97)
+#define _FMV_X_D(rd, rs1)             Rtype(83, rd, 0, rs1, 0, 113)
+#define _FCVT_D_L(rd, rs1, rm)        Rtype(83, rd, rm, rs1, 2, 105)
+#define _FCVT_D_LU(rd, rs1, rm)       Rtype(83, rd, rm, rs1, 3, 105)
+#define _FMV_D_X(rd, rs1)             Rtype(83, rd, 0, rs1, 0, 121)
+/*
+ * Pseudo instructions
+ */
+#define _FMV_S(r0, r1)                _FSGNJ_S(r0, r1, r1)
+#define _FABS_S(r0, r1)               _FSGNJX_S(r0, r1, r1)
+#define _FNEG_S(r0, r1)               _FSGNJN_S(r0, r1, r1)
+#define _FMV_D(r0, r1)                _FSGNJ_D(r0, r1, r1)
+#define _FABS_D(r0, r1)               _FSGNJX_D(r0, r1, r1)
+#define _FNEG_D(r0, r1)               _FSGNJN_D(r0, r1, r1)
+
+// Binary ALU operations
+static void addr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void addr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void subr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void mulr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void mulr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void divr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+static void divr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2);
+
+// Unary ALU operations
+static void sqrtr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void sqrtr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void negr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void negr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void absr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void absr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+
+// Transfer operations
+static void movr_f(jit_state_t *_jit, int32_t r0, int32_t r1);
+static void movr_d(jit_state_t *_jit, int32_t r0, int32_t r1);
+
+// Argument management
+static void retr_f(jit_state_t *_jit, int32_t u);
+static void retr_d(jit_state_t *_jit, int32_t u);
+
+// Load operations
+static void ldr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void ldi_f(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxi_f(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); +static void ldi_d(jit_state_t *_jit, int32_t r0, jit_word_t i0); +static void ldxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void ldxi_d(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0); + +// Store operations +static void str_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void str_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void sti_f(jit_state_t *_jit, jit_word_t i0, int32_t r0); +static void stxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxi_f(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); +static void sti_d(jit_state_t *_jit, jit_word_t i0, int32_t r0); +static void stxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2); +static void stxi_d(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1); + +// Branch instructions +static jit_reloc_t bltr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bler_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t beqr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bger_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bner_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunltr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunler_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunger_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bungtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t 
bltgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bordr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunordr_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bltr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bler_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t beqr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bger_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bner_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunltr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunler_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bunger_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bungtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bltgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static jit_reloc_t bordr_d(jit_state_t *_jit, int32_t r0, int32_t r1); + +/* + * Binary ALU operations + */ +static void +addr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FADD_S(r0, r1, r2)); +} +static void +addr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FADD_D(r0, r1, r2)); +} +static void +subr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FSUB_S(r0, r1, r2)); +} +static void +subr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FSUB_D(r0, r1, r2)); +} +static void +mulr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FMUL_S(r0, r1, r2)); +} +static void +mulr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FMUL_D(r0, r1, r2)); +} +static void +divr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FDIV_S(r0, r1, r2)); +} 
+static void +divr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + em_wp(_jit, _FDIV_D(r0, r1, r2)); +} + +/* + * Unary ALU operations + */ +static void +sqrtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSQRT_S(r0, r1)); +} +static void +sqrtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSQRT_D(r0, r1)); +} +static void +negr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FNEG_S(r0, r1)); +} +static void +negr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FNEG_D(r0, r1)); +} +static void +absr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FABS_S(r0, r1)); +} + +static void +absr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FABS_D(r0, r1)); +} + + +/* + * Load operations + */ +static void +ldr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FLW(r0, r1, 0)); +} +static void +ldr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FLD(r0, r1, 0)); +} +static void +ldi_f(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_f(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_f(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} +static void +ldxi_f(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _FLW(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_f(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} +static void +ldi_d(jit_state_t *_jit, int32_t r0, jit_word_t i0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + ldr_d(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldxr_d(jit_state_t *_jit, int32_t r0, int32_t 
r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r1, r2); + ldr_d(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); +} + +static void +ldxi_d(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) +{ + if (simm12_p(i0)) + em_wp(_jit, _FLD(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r1, i0); + ldr_d(_jit, r0, jit_gpr_regno(t0)); + unget_temp_gpr(_jit); + } +} + + + +/* + * Store operations + */ +static void +str_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSW(r0, r1, 0)); +} +static void +str_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FSD(r0, r1, 0)); +} +static void +sti_f(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_f(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} +static void +stxr_f(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_f(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} +static void +stxi_f(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _FSW(r0, r1, i0)); + else { + jit_gpr_t t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_f(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} +static void +sti_d(jit_state_t *_jit, jit_word_t i0, int32_t r0) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(t0), i0); + str_d(_jit, jit_gpr_regno(t0), r0); + unget_temp_gpr(_jit); +} +static void +stxr_d(jit_state_t *_jit, int32_t r0, int32_t r1, int32_t r2) +{ + jit_gpr_t t0 = get_temp_gpr(_jit); + addr(_jit, jit_gpr_regno(t0), r0, r1); + str_d(_jit, jit_gpr_regno(t0), r2); + unget_temp_gpr(_jit); +} +static void +stxi_d(jit_state_t *_jit, jit_word_t i0, int32_t r0, int32_t r1) +{ + if (simm12_p(i0)) + em_wp(_jit, _FSD(r0, r1, i0)); + else { + jit_gpr_t 
t0 = get_temp_gpr(_jit); + addi(_jit, jit_gpr_regno(t0), r0, i0); + str_d(_jit, jit_gpr_regno(t0), r1); + unget_temp_gpr(_jit); + } +} + + +/* + * Transfer operations + */ +static void +movr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + if (r0 != r1) + em_wp(_jit, _FMV_S(r0, r1)); +} + +static void +movr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + if (r0 != r1) + em_wp(_jit, _FMV_D(r0, r1)); +} +static void +truncr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_W_S(r0, r1, 1)); +} +static void +truncr_d_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_W_D(r0, r1, 1)); +} +static void +truncr_f_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_L_S(r0, r1, 1)); +} +static void +truncr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_L_D(r0, r1, 1)); +} + +static void +extr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ +#if __WORDSIZE == 64 + em_wp(_jit, _FCVT_S_L(r0, r1, 0)); +#elif __WORDSIZE == 32 + em_wp(_jit, _FCVT_S_W(r0, r1, 0)); +#endif +} +static void +extr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ +#if __WORDSIZE == 64 + em_wp(_jit, _FCVT_D_L(r0, r1, 0)); +#elif __WORDSIZE == 32 + em_wp(_jit, _FCVT_D_W(r0, r1, 0)); +#endif +} + +static void +extr_f_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_D_S(r0, r1, 0)); +} +static void +extr_d_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FCVT_S_D(r0, r1, 0)); +} + +static void +movi_f(jit_state_t *_jit, int32_t r0, jit_float32_t i0) +{ + union { int32_t i; jit_float32_t f; } u = { .f = i0 }; + jit_gpr_t reg = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(reg), u.i); + em_wp(_jit, _FMV_W_X(r0, jit_gpr_regno(reg))); + unget_temp_gpr(_jit); +} +static void +movi_d(jit_state_t *_jit, int32_t r0, jit_float64_t i0) +{ + // TODO: How to move a 64 bit value from a 32 bit X register? 
+ // ATM only works on RV64 + union { int64_t i; jit_float64_t f; } u = { .f = i0 }; + jit_gpr_t reg = get_temp_gpr(_jit); + movi(_jit, jit_gpr_regno(reg), u.i); + em_wp(_jit, _FMV_D_X(r0, jit_gpr_regno(reg))); + unget_temp_gpr(_jit); +} + + +/* + * Argument management + */ +static void +retval_f(jit_state_t *_jit, int32_t r0) +{ + movr_f(_jit, jit_fpr_regno(_FA0), r0); +} + +static void +retval_d(jit_state_t *_jit, int32_t r0) +{ + movr_d(_jit, jit_fpr_regno(_FA0), r0); +} + +static void +retr_f(jit_state_t *_jit, int32_t u) +{ + movr_f(_jit, jit_fpr_regno(_FA0), u); + ret(_jit); +} + +static void +retr_d(jit_state_t *_jit, int32_t u) +{ + movr_d(_jit, jit_fpr_regno(_FA0), u); + ret(_jit); +} + + +/* + * Branch instructions + */ + +static jit_reloc_t +bltr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_S(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bler_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_S(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +beqr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_S(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bler_f(_jit, r1, r0); +} + +static jit_reloc_t +bgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr_f(_jit, r1, r0); +} + +static jit_reloc_t +bner_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, 
_FEQ_S(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunltr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_S(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunler_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_S(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +buneqr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_S(t0, r0, r1)); + em_wp(_jit, _FLT_S(t1, r1, r0)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); /* one release per get_temp_gpr above */ + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunger_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_S(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bungtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_S(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bltgtr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_S(t0, r1, r0)); + em_wp(_jit, _FLT_S(t1, r0, r1)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); /* one release per get_temp_gpr above */ + 
unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bordr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_S(t0, r0, r0)); + em_wp(_jit, _FEQ_S(t1, r1, r1)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); /* one release per get_temp_gpr above */ + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunordr_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_S(t0, r1, r1)); + em_wp(_jit, _FEQ_S(t1, r0, r0)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); /* one release per get_temp_gpr above */ + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bltr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_D(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bler_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_D(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +beqr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_D(t0, r0, r1)); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bger_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bler_d(_jit, r1, r0); +} + +static jit_reloc_t +bgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + return bltr_d(_jit, r1, r0); +} + +static jit_reloc_t +bner_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + 
int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FEQ_D(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunltr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_D(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunler_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_D(t0, r1, r0)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +buneqr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_D(t0, r0, r1)); + em_wp(_jit, _FLT_D(t1, r1, r0)); + orr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); /* one release per get_temp_gpr above */ + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunger_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLT_D(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bungtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + jit_gpr_t tmp1 = get_temp_gpr(_jit); + int32_t t0 = jit_gpr_regno(tmp1); + + em_wp(_jit, _FLE_D(t0, r0, r1)); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bltgtr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FLT_D(t0, r1, r0)); + em_wp(_jit, _FLT_D(t1, r0, r1)); + orr(_jit, t0, t0, t1); + 
jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); /* one release per get_temp_gpr above */ + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bordr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_D(t0, r0, r0)); + em_wp(_jit, _FEQ_D(t1, r1, r1)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = bner(_jit, t0, jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); /* one release per get_temp_gpr above */ + unget_temp_gpr(_jit); + return ret; +} + +static jit_reloc_t +bunordr_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + int32_t t0 = jit_gpr_regno(get_temp_gpr(_jit)); + int32_t t1 = jit_gpr_regno(get_temp_gpr(_jit)); + + em_wp(_jit, _FEQ_D(t0, r1, r1)); + em_wp(_jit, _FEQ_D(t1, r0, r0)); + andr(_jit, t0, t0, t1); + jit_reloc_t ret = beqr(_jit, t0, jit_gpr_regno(_ZERO)); + unget_temp_gpr(_jit); /* one release per get_temp_gpr above */ + unget_temp_gpr(_jit); + return ret; +} diff --git a/lightening/riscv.c b/lightening/riscv.c new file mode 100644 index 000000000..eaac94a96 --- /dev/null +++ b/lightening/riscv.c @@ -0,0 +1,327 @@ +/* + * Copyright (C) 2021-2024 Free Software Foundation, Inc. + * + * This file is part of GNU lightning. + * + * GNU lightning is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU lightning is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. 
+ * + * Authors: + * Ekaitz Zarraga + */ + +#include "riscv-cpu.c" +#include "riscv-fpu.c" + +static const jit_gpr_t abi_gpr_args[] = { + _A0, _A1, _A2, _A3, _A4, _A5, _A6, _A7 +}; +static const jit_fpr_t abi_fpr_args[] = { + _FA0, _FA1, _FA2, _FA3, _FA4, _FA5, _FA6, _FA7 +}; +static const int abi_gpr_arg_count = sizeof(abi_gpr_args) / sizeof(abi_gpr_args[0]); +static const int abi_fpr_arg_count = sizeof(abi_fpr_args) / sizeof(abi_fpr_args[0]); + +struct abi_arg_iterator +{ + const jit_operand_t *args; + size_t argc; + + size_t arg_idx; + size_t gpr_idx; + size_t fpr_idx; + uint32_t vfp_used_registers; + size_t stack_size; + size_t stack_padding; +}; + +static size_t page_size; + +jit_bool_t +jit_get_cpu(void) +{ + page_size = sysconf(_SC_PAGE_SIZE); + // FIXME check version, extensions, hardware fp support + // + // List of macro definitions for riscv support: + // ------------------------------------------- + // __riscv: defined for any RISC-V target. Older versions of the GCC + // toolchain defined __riscv__. + // + // __riscv_xlen: 32 for RV32 and 64 for RV64. + // + // __riscv_float_abi_soft, __riscv_float_abi_single, + // __riscv_float_abi_double: one of these three will be defined, depending on + // target ABI. + // + // __riscv_cmodel_medlow, __riscv_cmodel_medany: one of these two will be + // defined, depending on the target code model. + // + // __riscv_mul: defined when targeting the 'M' ISA extension. + // + // __riscv_muldiv: defined when targeting the 'M' ISA extension and -mno-div + // has not been used. + // + // __riscv_div: defined when targeting the 'M' ISA extension and -mno-div has + // not been used. + // + // __riscv_atomic: defined when targeting the 'A' ISA extension. + // + // __riscv_flen: 32 when targeting the 'F' ISA extension (but not 'D') and 64 + // when targeting 'FD'. + // + // __riscv_fdiv: defined when targeting the 'F' or 'D' ISA extensions and + // -mno-fdiv has not been used. 
+ // + // __riscv_fsqrt: defined when targeting the 'F' or 'D' ISA extensions and + // -mno-fdiv has not been used. + // + // __riscv_compressed: defined when targeting the 'C' ISA extension. + return 1; +} + +jit_bool_t +jit_init(jit_state_t *_jit) +{ + return 1; +} + +static size_t +jit_initial_frame_size (void) +{ + return 0; +} + +static void +reset_abi_arg_iterator(struct abi_arg_iterator *iter, size_t argc, + const jit_operand_t *args) +{ + memset(iter, 0, sizeof *iter); + iter->argc = argc; + iter->args = args; +} + +static void +next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg) +{ + ASSERT(iter->arg_idx < iter->argc); + enum jit_operand_abi abi = iter->args[iter->arg_idx].abi; + iter->arg_idx++; + if (is_gpr_arg(abi) && iter->gpr_idx < abi_gpr_arg_count) { + *arg = jit_operand_gpr (abi, abi_gpr_args[iter->gpr_idx++]); + return; + } + if (is_fpr_arg(abi) && iter->fpr_idx < abi_fpr_arg_count) { + *arg = jit_operand_fpr (abi, abi_fpr_args[iter->fpr_idx++]); + return; + } + *arg = jit_operand_mem (abi, JIT_SP, iter->stack_size); +#if __WORDSIZE == 32 + iter->stack_size += 4; +#elif __WORDSIZE == 64 + iter->stack_size += 8; +#endif +} + +static void +jit_flush(void *fptr, void *tptr) +{ + jit_word_t f = (jit_word_t)fptr & -page_size; + jit_word_t t = (((jit_word_t)tptr) + page_size - 1) & -page_size; + __clear_cache((void *)f, (void *)t); +} + +static inline size_t +jit_stack_alignment(void) +{ + return 8; + // NOTE: See: https://github.com/riscv/riscv-gcc/issues/61 +} + +static void +jit_try_shorten(jit_state_t *_jit, jit_reloc_t reloc, jit_pointer_t addr) +{ +} + +static void* +bless_function_pointer(void *ptr) +{ + return ptr; +} + + +/* + * Veneers + */ +struct veneer{ + instr_t auipc; + instr_t load; // `ld` in RV64 and `lw` in RV32 + instr_t jalr; +#if __WORDSIZE == 64 + uint64_t address; +#elif __WORDSIZE == 32 + uint32_t address; +#endif +}; + +static void +emit_veneer(jit_state_t *_jit, jit_pointer_t target) +{ + // We need to generate 
something like this (RV64): + // ---------------------------------------------- + // auipc t0, 0 + // ld t0, 12(t0) + // jalr zero, 0(t0) + // ADDRESS_LITERAL + jit_gpr_t t0 = get_temp_gpr(_jit); + emit_u32(_jit, _AUIPC(jit_gpr_regno(t0), 0)); +#if __WORDSIZE == 64 + emit_u32(_jit, _LD(jit_gpr_regno(t0), jit_gpr_regno(t0), 12)); +#elif __WORDSIZE == 32 + emit_u32(_jit, _LW(jit_gpr_regno(t0), jit_gpr_regno(t0), 12)); +#endif + emit_u32(_jit, _JALR(jit_gpr_regno(_ZERO), jit_gpr_regno(t0), 0)); +#if __WORDSIZE == 64 + emit_u64(_jit, (uint64_t) target); +#elif __WORDSIZE == 32 + emit_u32(_jit, (uint32_t) target); +#endif + unget_temp_gpr(_jit); +} + +static void +patch_veneer(uint32_t *loc, jit_pointer_t addr) +{ + struct veneer *v = (struct veneer*) loc; +#if __WORDSIZE == 64 + v->address = (uint64_t) addr; +#elif __WORDSIZE == 32 + v->address = (uint32_t) addr; +#endif +} + + +/* + * Conditional jumps + */ +static void +patch_jcc_offset(uint32_t *loc, ptrdiff_t v) +{ + + instr_t *i = (instr_t *) loc; + i->B.imm11 = (v >> 11) & 0x1; + i->B.imm4_1 = (v >> 1) & 0xf; + i->B.imm10_5 = (v >> 5) & 0x3f; + i->B.imm12 = (v >> 12) & 0x1; +} +static void +patch_veneer_jcc_offset(uint32_t *loc, ptrdiff_t offset){ + patch_jcc_offset(loc, offset); +} + +static int32_t +read_jcc_offset(uint32_t *loc) +{ + instr_t i; + i.w = *loc; + + int32_t offset = i.B.imm12 << 31; + offset >>= 20; + offset |= (i.B.imm11 << 11); + offset |= (i.B.imm10_5 << 5); + offset |= (i.B.imm4_1 << 1); + + return offset; +} +static int +offset_in_jcc_range(ptrdiff_t offset, int flags) +{ + if(offset & 1) + return 0; + else + return -0x1000 <= offset && offset <= 0xFFF; +} + +/* + * Unconditional jumps + */ +static int32_t read_jmp_offset(uint32_t *loc) +{ + instr_t i; + i.w = *loc; + + int32_t offset = i.J.imm20 << 31; + offset >>= 12; + offset |= (i.J.imm19_12 << 12); + offset |= (i.J.imm11 << 11); + offset |= (i.J.imm10_1 << 1); + return offset; +} +static int +offset_in_jmp_range(ptrdiff_t offset, int 
flags) +{ + if(offset & 1) + return 0; + else + return -0x100000 <= offset && offset <= 0xFFFFF; +} + +static void +patch_jmp_offset(uint32_t *loc, ptrdiff_t v) +{ + instr_t *i = (instr_t *) loc; + i->J.imm20 = (v >> 20) & 0x1; + i->J.imm19_12= (v >> 12) & 0xff; + i->J.imm11 = (v >> 11) & 0x1; + i->J.imm10_1 = (v >> 1) & 0x3ff; +} + +static void +patch_veneer_jmp_offset(uint32_t *loc, ptrdiff_t offset) +{ + patch_jmp_offset(loc, offset); +} + + +/* + * Jumps around the veneer + */ +static void +patch_jmp_without_veneer(jit_state_t *_jit, uint32_t *loc) +{ + patch_jmp_offset(loc, (uint8_t *) _jit->pc.ui - (uint8_t *) loc); /* distance in bytes: patch_jmp_offset encodes a byte offset */ +} +static uint32_t* +jmp_without_veneer(jit_state_t *_jit) +{ + uint32_t *loc = _jit->pc.ui; + emit_u32(_jit, _JAL(jit_gpr_regno(_ZERO), 0)); + return loc; +} + + +/* + * Load from pool offset + */ +static void +patch_load_from_pool_offset(uint32_t *loc, int32_t v) +{ + load_from_pool_t *i = (load_from_pool_t *) loc; + int32_t hi20 = (v + 0x800) >> 12; /* round: the load sign-extends its 12-bit immediate */ + i->inst.auipc.U.imm31_12 = hi20; + i->inst.load.I.imm11_0 = v - hi20 * 4096; /* remainder in [-2048, 2047] */ +} +static int32_t +read_load_from_pool_offset(uint32_t *loc) +{ + load_from_pool_t *i = (load_from_pool_t*) loc; + return (int32_t) ((uint32_t) i->inst.auipc.U.imm31_12 << 12) + ((int32_t) ((uint32_t) i->inst.load.I.imm11_0 << 20) >> 20); /* (hi20 << 12) + sign-extended lo12, inverse of the patch above */ +} + diff --git a/lightening/riscv.h b/lightening/riscv.h new file mode 100644 index 000000000..173216655 --- /dev/null +++ b/lightening/riscv.h @@ -0,0 +1,194 @@ +/* + * Copyright (C) 2021-2024 Free Software Foundation, Inc. + * + * This file is part of GNU lightning. + * + * GNU lightning is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 3, or (at your option) + * any later version. + * + * GNU lightning is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + * License for more details. 
+ * + * Authors: + * Ekaitz Zarraga + */ + +#ifndef _jit_riscv_h +#define _jit_riscv_h + +#define JIT_NEEDS_LITERAL_POOL 1 + +// x registers +// Special registers +#define _RA JIT_GPR(1) // Return address +#define _SP JIT_GPR(2) // Stack pointer +#define _GP JIT_GPR(3) // Global pointer +#define _TP JIT_GPR(4) // Thread pointer +#define _FP JIT_GPR(8) // Frame pointer +#define _ZERO JIT_GPR(0) // Always zero +// Argument passing +#define _A0 JIT_GPR(10) +#define _A1 JIT_GPR(11) +#define _A2 JIT_GPR(12) +#define _A3 JIT_GPR(13) +#define _A4 JIT_GPR(14) +#define _A5 JIT_GPR(15) +#define _A6 JIT_GPR(16) +#define _A7 JIT_GPR(17) +// Saved registers +#define _S0 _FP // S0 is the frame pointer normally +#define _S1 JIT_GPR(9) +#define _S2 JIT_GPR(18) +#define _S3 JIT_GPR(19) +#define _S4 JIT_GPR(20) +#define _S5 JIT_GPR(21) +#define _S6 JIT_GPR(22) +#define _S7 JIT_GPR(23) +#define _S8 JIT_GPR(24) +#define _S9 JIT_GPR(25) +#define _S10 JIT_GPR(26) +#define _S11 JIT_GPR(27) +// Temporaries +#define _T0 JIT_GPR(5) +#define _T1 JIT_GPR(6) +#define _T2 JIT_GPR(7) +#define _T3 JIT_GPR(28) +#define _T4 JIT_GPR(29) +#define _T5 JIT_GPR(30) +#define _T6 JIT_GPR(31) + +// f registers +// Temporaries +#define _FT0 JIT_FPR(0) +#define _FT1 JIT_FPR(1) +#define _FT2 JIT_FPR(2) +#define _FT3 JIT_FPR(3) +#define _FT4 JIT_FPR(4) +#define _FT5 JIT_FPR(5) +#define _FT6 JIT_FPR(6) +#define _FT7 JIT_FPR(7) +#define _FT8 JIT_FPR(28) +#define _FT9 JIT_FPR(29) +#define _FT10 JIT_FPR(30) +#define _FT11 JIT_FPR(31) +// Saved registers +#define _FS0 JIT_FPR(8) +#define _FS1 JIT_FPR(9) +#define _FS2 JIT_FPR(18) +#define _FS3 JIT_FPR(19) +#define _FS4 JIT_FPR(20) +#define _FS5 JIT_FPR(21) +#define _FS6 JIT_FPR(22) +#define _FS7 JIT_FPR(23) +#define _FS8 JIT_FPR(24) +#define _FS9 JIT_FPR(25) +#define _FS10 JIT_FPR(26) +#define _FS11 JIT_FPR(27) +// Argument passing +#define _FA0 JIT_FPR(10) +#define _FA1 JIT_FPR(11) +#define _FA2 JIT_FPR(12) +#define _FA3 JIT_FPR(13) +#define _FA4 JIT_FPR(14) 
+#define _FA5 JIT_FPR(15) +#define _FA6 JIT_FPR(16) +#define _FA7 JIT_FPR(17) + + +// JIT Registers +// ---------------------------------------------------------------------- +// Caller-save registers JIT_R${NUM} +// Callee-save registers JIT_V${NUM} +// Caller-save temporary registers JIT_TMP${NUM} +// Caller-save floating point registers JIT_F${NUM} +// Callee-save floating point registers JIT_VF${NUM} +// Caller-save floating point temporary registers JIT_FTMP${NUM} + +// Caller-save registers +#define JIT_R0 _A0 +#define JIT_R1 _A1 +#define JIT_R2 _A2 +#define JIT_R3 _A3 +#define JIT_R4 _A4 +#define JIT_R5 _A5 +#define JIT_R6 _A6 +#define JIT_R7 _A7 + +// Use this as a CARRY +#define JIT_CARRY _T0 +#define JIT_TMP0 _T1 +#define JIT_TMP1 _T2 +#define JIT_TMP2 _T3 + +#define JIT_TMP3 _T4 +// Temporaries +#define JIT_TMP4 _T5 +#define JIT_TMP5 _T6 + +// Callee-save registers +#define JIT_V0 _S1 +#define JIT_V1 _S2 +#define JIT_V2 _S3 +#define JIT_V3 _S4 +#define JIT_V4 _S5 +#define JIT_V5 _S6 +#define JIT_V6 _S7 +#define JIT_V7 _S8 +#define JIT_V8 _S9 +#define JIT_V9 _S10 +#define JIT_V10 _S11 + + +// Callee-save floating point registers +#define JIT_VF0 _FS0 +#define JIT_VF1 _FS1 +#define JIT_VF2 _FS2 +#define JIT_VF3 _FS3 +#define JIT_VF4 _FS4 +#define JIT_VF5 _FS5 +#define JIT_VF6 _FS6 +#define JIT_VF7 _FS7 +#define JIT_VF8 _FS8 +#define JIT_VF9 _FS9 +#define JIT_VF10 _FS10 +#define JIT_VF11 _FS11 + +// Caller save floating point registers +#define JIT_F0 _FA0 +#define JIT_F1 _FA1 +#define JIT_F2 _FA2 +#define JIT_F3 _FA3 +#define JIT_F4 _FA4 +#define JIT_F5 _FA5 +#define JIT_F6 _FA6 +#define JIT_F7 _FA7 +// NOTE: These are temporaries, but we can use them as general purpose +// registers as there's only one temporary JIT_FTMP supported by lightening.c +#define JIT_F8 _FT0 +#define JIT_F9 _FT1 +#define JIT_F10 _FT2 +#define JIT_F11 _FT3 +#define JIT_F12 _FT4 +#define JIT_F13 _FT5 +#define JIT_F14 _FT6 +#define JIT_F15 _FT7 +#define JIT_F16 _FT8 +#define JIT_F17 
_FT9 +#define JIT_F18 _FT10 + +// Floating point temporary register +#define JIT_FTMP _FT11 + +// Special purpose registers +#define JIT_FP _FP +#define JIT_LR _RA +#define JIT_SP _SP + +// TODO: Make sure this is correct +#define JIT_PLATFORM_CALLEE_SAVE_GPRS JIT_LR + +#endif From 797fe5067c41869070b339f0a653f1af3a759335 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Wed, 19 Jan 2022 11:09:20 +0100 Subject: [PATCH 06/23] Add RISCV to CI and makefile --- .gitlab-ci.yml | 8 ++++++++ tests/Makefile | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a5e8694bf..449fdcd2d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,6 +4,7 @@ before_script: - dpkg --add-architecture i386 - dpkg --add-architecture arm64 - dpkg --add-architecture armhf + - dpkg --add-architecture riscv64 - apt-get update -qq - apt-get install -y libc6-dev:amd64 gcc make @@ -11,8 +12,10 @@ before_script: gcc-i686-linux-gnu libc6-dev-i386-cross libc6:i386 gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64 gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6:armhf + gcc-riscv64-linux-gnu libc6-dev-riscv64-cross libc6:riscv64 - update-binfmts --enable qemu-aarch64 - update-binfmts --enable qemu-arm + - update-binfmts --enable qemu-riscv64 x86-64: stage: test @@ -38,3 +41,8 @@ armhf-thumb: stage: test script: - make -C tests test-armv7 CC_ARMv7="arm-linux-gnueabihf-gcc -mthumb" + +riscv: + stage: test + script: + - make -C tests test-riscv CC_RISCV="riscv64-linux-gnu-gcc" diff --git a/tests/Makefile b/tests/Makefile index 5c44bceae..271f3e403 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -14,7 +14,8 @@ TARGETS ?= native ia32 aarch64 armv7 riscv # libc6-dev:amd64 gcc make \ # qemu binfmt-support qemu-user-static \ # gcc-i686-linux-gnu libc6-dev-i386-cross libc6:i386 \ -# gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64 +# gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64\ +# gcc-riscv64-linux-gnu 
libc6-dev-riscv64-cross libc6:riscv64 # CC = gcc CC_IA32 ?= guix environment --pure -s i686-linux --ad-hoc gcc-toolchain -- gcc From 3edd48b046e0cfc2a3b846875989882bdd496670 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Wed, 19 Jan 2022 12:20:42 +0100 Subject: [PATCH 07/23] Fix CI --- .gitlab-ci.yml | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 449fdcd2d..4e4b40ff6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,48 +1,60 @@ image: debian:stable before_script: - - dpkg --add-architecture i386 - - dpkg --add-architecture arm64 - - dpkg --add-architecture armhf - - dpkg --add-architecture riscv64 - apt-get update -qq - - apt-get install -y - libc6-dev:amd64 gcc make - binfmt-support qemu-user-static - gcc-i686-linux-gnu libc6-dev-i386-cross libc6:i386 - gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64 - gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6:armhf - gcc-riscv64-linux-gnu libc6-dev-riscv64-cross libc6:riscv64 - - update-binfmts --enable qemu-aarch64 - - update-binfmts --enable qemu-arm - - update-binfmts --enable qemu-riscv64 + - apt-get install -y make binfmt-support qemu-user-static x86-64: stage: test script: + - dpkg --add-architecture arm64 + - apt-get update -qq + - apt-get install -y libc6-dev:amd64 gcc - make -C tests test-native i686: stage: test script: + - dpkg --add-architecture i386 + - apt-get update -qq + - apt-get install -y gcc-i686-linux-gnu libc6-dev-i386-cross libc6:i386 - make -C tests test-ia32 CC_IA32=i686-linux-gnu-gcc aarch64: stage: test script: + - dpkg --add-architecture arm64 + - apt-get update -qq + - apt-get install -y gcc-aarch64-linux-gnu libc6-dev-arm64-cross libc6:arm64 - make -C tests test-aarch64 CC_AARCH64=aarch64-linux-gnu-gcc armhf: stage: test script: + - dpkg --add-architecture armhf + - apt-get update -qq + - apt-get install -y gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6:armhf - make 
-C tests test-armv7 CC_ARMv7="arm-linux-gnueabihf-gcc -marm" + armhf-thumb: stage: test script: + - dpkg --add-architecture armhf + - apt-get update -qq + - apt-get install -y gcc-arm-linux-gnueabihf libc6-dev-armhf-cross libc6:armhf - make -C tests test-armv7 CC_ARMv7="arm-linux-gnueabihf-gcc -mthumb" + riscv: stage: test script: - - make -C tests test-riscv CC_RISCV="riscv64-linux-gnu-gcc" + - dpkg --add-architecture riscv64 + - apt-get update -qq + - apt-get install -y gcc-riscv64-linux-gnu + - echo /usr/local/lib/riscv64-linux-gnu >>/etc/ld.so.conf.d/riscv64-linux-gnu.conf + - echo /lib/riscv64-linux-gnu >>/etc/ld.so.conf.d/riscv64-linux-gnu.conf + - echo /usr/lib/riscv64-linux-gnu >>/etc/ld.so.conf.d/riscv64-linux-gnu.conf + - echo /usr/riscv64-linux-gnu/lib >>/etc/ld.so.conf.d/riscv64-linux-gnu.conf + - ln -s /usr/riscv64-linux-gnu/lib/ld-linux-riscv64-lp64d.so.1 /lib + - make -C tests test-riscv CC_RISCV="riscv64-linux-gnu-gcc -static" From 775d11b21ef2e065bac0cdc64b684f1d927612ba Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Tue, 8 Oct 2024 17:00:27 +0200 Subject: [PATCH 08/23] riscv: Add fence --- lightening/riscv-cpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c index d9d36f30b..73bede8da 100644 --- a/lightening/riscv-cpu.c +++ b/lightening/riscv-cpu.c @@ -2461,6 +2461,7 @@ static void mfence(jit_state_t *_jit) { // TODO: we may need it for atomic operations? 
+ em_wp(_jit, _FENCE(0xFF)); } static void From 76549a674ac4e4d234b8c3c0254185d79ee5c317 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 7 Nov 2024 19:51:37 +0100 Subject: [PATCH 09/23] riscv: Pack the veneer struct --- lightening/riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightening/riscv.c b/lightening/riscv.c index eaac94a96..553916748 100644 --- a/lightening/riscv.c +++ b/lightening/riscv.c @@ -160,7 +160,7 @@ bless_function_pointer(void *ptr) /* * Veneers */ -struct veneer{ +struct __attribute__((packed)) veneer{ instr_t auipc; instr_t load; // `ld` in RV64 and `lw` in RV32 instr_t jalr; From f6f2a757c327bd605d937fd875ae5856b48f4f59 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 14 Nov 2024 13:01:26 +0100 Subject: [PATCH 10/23] riscv: don't pack veneers, use padding --- lightening/riscv.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/lightening/riscv.c b/lightening/riscv.c index 553916748..43122ab9b 100644 --- a/lightening/riscv.c +++ b/lightening/riscv.c @@ -160,11 +160,12 @@ bless_function_pointer(void *ptr) /* * Veneers */ -struct __attribute__((packed)) veneer{ +struct veneer{ instr_t auipc; instr_t load; // `ld` in RV64 and `lw` in RV32 instr_t jalr; #if __WORDSIZE == 64 + uint32_t padding; uint64_t address; #elif __WORDSIZE == 32 uint32_t address; @@ -174,21 +175,25 @@ struct __attribute__((packed)) veneer{ static void emit_veneer(jit_state_t *_jit, jit_pointer_t target) { - // We need to generate something like this (RV64): + // We need to generate something like this: // ---------------------------------------------- - // auipc t0, 0 - // ld t0, 12(t0) - // jalr zero, 0(t0) - // ADDRESS_LITERAL + // 32 bits: | 64 bits: + // auipc t0, 0 | auipc t0, 0 + // lw t0, 12(t0) | ld t0, 16(t0) + // jalr zero, 0(t0) | jalr zero, 0(t0) + // ADDRESS_LITERAL | .byte 0x00, 0x00, 0x00, 0x00 (padding) + // | ADDRESS_LITERAL + // jit_gpr_t t0 = get_temp_gpr(_jit); emit_u32(_jit, 
_AUIPC(jit_gpr_regno(t0), 0)); #if __WORDSIZE == 64 - emit_u32(_jit, _LD(jit_gpr_regno(t0), jit_gpr_regno(t0), 12)); + emit_u32(_jit, _LD(jit_gpr_regno(t0), jit_gpr_regno(t0), 16)); #elif __WORDSIZE == 32 emit_u32(_jit, _LW(jit_gpr_regno(t0), jit_gpr_regno(t0), 12)); #endif emit_u32(_jit, _JALR(jit_gpr_regno(_ZERO), jit_gpr_regno(t0), 0)); #if __WORDSIZE == 64 + emit_u32(_jit, 0); // Padding emit_u64(_jit, (uint64_t) target); #elif __WORDSIZE == 32 emit_u32(_jit, (uint32_t) target); From 93380fc37777258c7758d027fed1420bbcc780c4 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 14 Nov 2024 13:24:48 +0100 Subject: [PATCH 11/23] riscv: clean patch jumps --- lightening/riscv-cpu.c | 20 ++++++++------------ lightening/riscv.c | 15 ++++----------- 2 files changed, 12 insertions(+), 23 deletions(-) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c index 73bede8da..1026466ea 100644 --- a/lightening/riscv-cpu.c +++ b/lightening/riscv-cpu.c @@ -205,7 +205,9 @@ Btype(int32_t op, int32_t fct, int32_t rs1, int32_t rs2, int32_t imm) assert(!(fct & ~0x07)); assert(!(rs1 & ~0x1f)); assert(!(rs2 & ~0x1f)); - assert(!(imm & 1) && simm12_p(imm)); + assert(!(imm & 1)); + assert(simm12_p(imm >> 1)); + i.B.opcode = op; i.B.imm11 = (imm >> 11) & 0x1; i.B.imm4_1 = (imm >> 1) & 0xf; @@ -236,7 +238,9 @@ Jtype(int32_t op, int32_t rd, int32_t imm) instr_t i; assert(!(op & ~0x7f)); assert(!(rd & ~0x1f)); - assert(!(imm & 1) && imm <= 1048575 && imm >= -1048576); + assert(!(imm & 1)); + assert(simm20_p(imm >> 1)); + i.J.opcode = op; i.J.rd = rd; i.J.imm19_12= (imm >> 12) & 0xff; @@ -1102,11 +1106,7 @@ static uint32_t patch_cc_jump(uint32_t inst, int32_t offset){ instr_t i; i.w = inst; - i.B.imm11 = (offset >> 11) & 0x1; - i.B.imm4_1 = (offset >> 1) & 0xf; - i.B.imm10_5 = (offset >> 5) & 0x3f; - i.B.imm12 = (offset >> 12) & 0x1; - return i.w; + return Btype(i.B.opcode, i.B.funct3, i.B.rs1, i.B.rs2, offset); } static jit_reloc_t @@ -2250,11 +2250,7 @@ patch_jump(uint32_t inst, 
int32_t offset) { instr_t i; i.w = inst; - i.J.imm20 = (offset >> 20) & 0x1; - i.J.imm19_12= (offset >> 12) & 0xff; - i.J.imm11 = (offset >> 11) & 0x1; - i.J.imm10_1 = (offset >> 1) & 0x3ff; - return i.w; + return Jtype(i.J.opcode, i.J.rd, offset); } static jit_reloc_t emit_jump(jit_state_t *_jit, uint32_t inst) diff --git a/lightening/riscv.c b/lightening/riscv.c index 43122ab9b..9d5462af6 100644 --- a/lightening/riscv.c +++ b/lightening/riscv.c @@ -219,12 +219,8 @@ patch_veneer(uint32_t *loc, jit_pointer_t addr) static void patch_jcc_offset(uint32_t *loc, ptrdiff_t v) { - instr_t *i = (instr_t *) loc; - i->B.imm11 = (v >> 11) & 0x1; - i->B.imm4_1 = (v >> 1) & 0xf; - i->B.imm10_5 = (v >> 5) & 0x3f; - i->B.imm12 = (v >> 12) & 0x1; + i->w = patch_cc_jump(i->w, v); } static void patch_veneer_jcc_offset(uint32_t *loc, ptrdiff_t offset){ @@ -251,7 +247,7 @@ offset_in_jcc_range(ptrdiff_t offset, int flags) if(offset & 1) return 0; else - return -0x1000 <= offset && offset <= 0xFFF; + return simm12_p(offset); } /* @@ -275,17 +271,14 @@ offset_in_jmp_range(ptrdiff_t offset, int flags) if(offset & 1) return 0; else - return -0x100000 <= offset && offset <= 0xFFFFF; + return simm20_p(offset); } static void patch_jmp_offset(uint32_t *loc, ptrdiff_t v) { instr_t *i = (instr_t *) loc; - i->J.imm20 = (v >> 20) & 0x1; - i->J.imm19_12= (v >> 12) & 0xff; - i->J.imm11 = (v >> 11) & 0x1; - i->J.imm10_1 = (v >> 1) & 0x3ff; + i->w = patch_jump(i->w, v); } static void From c6008fd0ab6ce8a3acf24c67790b70b6751a884d Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 14 Nov 2024 16:11:46 +0100 Subject: [PATCH 12/23] riscv: fix the B and J type size check --- lightening/riscv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightening/riscv.c b/lightening/riscv.c index 9d5462af6..dc74cce7d 100644 --- a/lightening/riscv.c +++ b/lightening/riscv.c @@ -247,7 +247,7 @@ offset_in_jcc_range(ptrdiff_t offset, int flags) if(offset & 1) return 0; else - return 
simm12_p(offset); + return simm12_p(offset >> 1); } /* @@ -271,7 +271,7 @@ offset_in_jmp_range(ptrdiff_t offset, int flags) if(offset & 1) return 0; else - return simm20_p(offset); + return simm20_p(offset >> 1); } static void From fb527804f998858d0a9c75c540dd06844c356aef Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 14 Nov 2024 16:57:13 +0100 Subject: [PATCH 13/23] riscv: add get_callr_temp --- lightening/riscv.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lightening/riscv.c b/lightening/riscv.c index dc74cce7d..a62e48553 100644 --- a/lightening/riscv.c +++ b/lightening/riscv.c @@ -156,6 +156,11 @@ bless_function_pointer(void *ptr) return ptr; } +static jit_gpr_t +get_callr_temp (jit_state_t * _jit) +{ + return _RA; +} /* * Veneers From 8c7990d4a14ce232c13faf38e87c888babe05ebd Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 14 Nov 2024 19:35:43 +0100 Subject: [PATCH 14/23] riscv: fix literal pool guard jump address calc --- lightening/riscv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightening/riscv.c b/lightening/riscv.c index a62e48553..ab58b3ee1 100644 --- a/lightening/riscv.c +++ b/lightening/riscv.c @@ -299,7 +299,7 @@ patch_veneer_jmp_offset(uint32_t *loc, ptrdiff_t offset) static void patch_jmp_without_veneer(jit_state_t *_jit, uint32_t *loc) { - patch_jmp_offset(loc, _jit->pc.ui - loc); + patch_jmp_offset(loc, _jit->pc.uw - (uintptr_t)loc); } static uint32_t* jmp_without_veneer(jit_state_t *_jit) From 33eddc7b62672b53999dee9f469b1b36ef36915e Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 14 Nov 2024 23:35:54 +0100 Subject: [PATCH 15/23] riscv: simplify load from pool --- lightening/riscv-cpu.c | 2 +- lightening/riscv.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c index 1026466ea..97c0ae49c 100644 --- a/lightening/riscv-cpu.c +++ b/lightening/riscv-cpu.c @@ -454,7 +454,7 @@ static void comr(jit_state_t *_jit, int32_t 
r0, int32_t r1); static void movr(jit_state_t *_jit, int32_t r0, int32_t r1); static void movi(jit_state_t *_jit, int32_t r0, jit_word_t i0); -static uint64_t patch_load_from_pool(uint64_t instrs, uint32_t off); +static uint64_t patch_load_from_pool(uint64_t instrs, int32_t off); static jit_reloc_t emit_load_from_pool(jit_state_t *_jit, uint64_t insts); static jit_reloc_t mov_addr(jit_state_t *_jit, int32_t r0); static jit_reloc_t movi_from_pool(jit_state_t *_jit, int32_t r0); diff --git a/lightening/riscv.c b/lightening/riscv.c index ab58b3ee1..c4b43dc02 100644 --- a/lightening/riscv.c +++ b/lightening/riscv.c @@ -317,9 +317,7 @@ static void patch_load_from_pool_offset(uint32_t *loc, int32_t v) { load_from_pool_t *i = (load_from_pool_t *) loc; - int32_t hi20 = v >>12; - i->inst.auipc.U.imm31_12 = hi20; - i->inst.load.I.imm11_0 = v - (hi20<<12); + i->l = patch_load_from_pool(i->l, v); } static int32_t read_load_from_pool_offset(uint32_t *loc) From 741af987a3dfcec430eeffde826269c977b97ac4 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Thu, 14 Nov 2024 11:43:25 +0100 Subject: [PATCH 16/23] riscv: fix load size for ldxi instructions --- lightening/riscv-cpu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c index 97c0ae49c..191a26e68 100644 --- a/lightening/riscv-cpu.c +++ b/lightening/riscv-cpu.c @@ -2067,7 +2067,7 @@ static void ldxi_c(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) { if (simm12_p(i0)) - em_wp(_jit, _LD(r0, r1, i0)); + em_wp(_jit, _LB(r0, r1, i0)); else { jit_gpr_t t0 = get_temp_gpr(_jit); addi(_jit, jit_gpr_regno(t0), r1, i0); @@ -2079,7 +2079,7 @@ static void ldxi_uc(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) { if (simm12_p(i0)) - em_wp(_jit, _LD(r0, r1, i0)); + em_wp(_jit, _LBU(r0, r1, i0)); else { jit_gpr_t t0 = get_temp_gpr(_jit); addi(_jit, jit_gpr_regno(t0), r1, i0); @@ -2091,7 +2091,7 @@ static void ldxi_us(jit_state_t *_jit, int32_t 
r0, int32_t r1, jit_word_t i0) { if (simm12_p(i0)) - em_wp(_jit, _LD(r0, r1, i0)); + em_wp(_jit, _LHU(r0, r1, i0)); else { jit_gpr_t t0 = get_temp_gpr(_jit); addi(_jit, jit_gpr_regno(t0), r1, i0); @@ -2103,7 +2103,7 @@ static void ldxi_s(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) { if (simm12_p(i0)) - em_wp(_jit, _LD(r0, r1, i0)); + em_wp(_jit, _LH(r0, r1, i0)); else { jit_gpr_t t0 = get_temp_gpr(_jit); addi(_jit, jit_gpr_regno(t0), r1, i0); @@ -2115,7 +2115,7 @@ static void ldxi_i(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) { if (simm12_p(i0)) - em_wp(_jit, _LD(r0, r1, i0)); + em_wp(_jit, _LW(r0, r1, i0)); else { jit_gpr_t t0 = get_temp_gpr(_jit); addi(_jit, jit_gpr_regno(t0), r1, i0); @@ -2128,7 +2128,7 @@ static void ldxi_ui(jit_state_t *_jit, int32_t r0, int32_t r1, jit_word_t i0) { if (simm12_p(i0)) - em_wp(_jit, _LD(r0, r1, i0)); + em_wp(_jit, _LWU(r0, r1, i0)); else { jit_gpr_t t0 = get_temp_gpr(_jit); addi(_jit, jit_gpr_regno(t0), r1, i0); From 019cd024105f1238d6a71ddc3248ccc7573908c5 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Fri, 15 Nov 2024 12:01:23 +0100 Subject: [PATCH 17/23] riscv: movi: sign extend hi --- lightening/riscv-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c index 191a26e68..bd2fff593 100644 --- a/lightening/riscv-cpu.c +++ b/lightening/riscv-cpu.c @@ -1576,7 +1576,7 @@ movi(jit_state_t *_jit, int32_t r0, jit_word_t i0) int32_t srcreg = jit_gpr_regno(_ZERO); if (simm32_p(i0)){ - int64_t hi = ((i0 + 0x800) >> 12) & 0xFFFFF; + int64_t hi = (((i0 + 0x800) >> 12) & 0xFFFFF) << 44 >> 44; int64_t lo = (int32_t)i0<<20>>20; if(hi){ From 746660bf08ca99e2f25ad2715632e4c451832abe Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Fri, 15 Nov 2024 12:01:52 +0100 Subject: [PATCH 18/23] riscv: movi: use addiw in RV64 --- lightening/riscv-cpu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c 
index bd2fff593..92ee7cf1f 100644 --- a/lightening/riscv-cpu.c +++ b/lightening/riscv-cpu.c @@ -1585,7 +1585,11 @@ movi(jit_state_t *_jit, int32_t r0, jit_word_t i0) } if(lo || hi == 0){ +#if __WORDSIZE == 64 + em_wp(_jit, _ADDIW(r0, srcreg, lo)); +#elif __WORDSIZE == 32 em_wp(_jit, _ADDI(r0, srcreg, lo)); +#endif } } else { From cbda249dc563343c516e8c7287475a0bdf80386f Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Fri, 15 Nov 2024 13:09:45 +0100 Subject: [PATCH 19/23] riscv: better `movi` --- lightening/riscv-cpu.c | 44 ++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c index 92ee7cf1f..6009870a6 100644 --- a/lightening/riscv-cpu.c +++ b/lightening/riscv-cpu.c @@ -1570,6 +1570,20 @@ movr(jit_state_t *_jit, int32_t r0, int32_t r1) em_wp(_jit, _MV(r0, r1)); } + +static int +count_trailing_zeros(uint64_t x) +{ + if(x == 0) + return 64; + int count = 0; + while((x & 0x1) == 0){ + x >>= 1; + count++; + } + return count; +} + static void movi(jit_state_t *_jit, int32_t r0, jit_word_t i0) { @@ -1594,27 +1608,15 @@ movi(jit_state_t *_jit, int32_t r0, jit_word_t i0) } else { // 64 bits: load in various steps - // lui, addi, slli, addi, slli, addi, slli, addi - int64_t hh = (i0>>44); - int64_t hl = (i0>>33) - (hh<<11); - int64_t lh = (i0>>22) - ((hh<<22) + (hl<<11)); - int64_t lm = (i0>>11) - ((hh<<33) + (hl<<22) + (lh<<11)); - int64_t ll = i0 - ((hh<<44) + (hl<<33) + (lh<<22) + (lm<<11)); - - - em_wp(_jit, _LUI(r0, hh)); - em_wp(_jit, _SLLI(r0, r0, 32)); - em_wp(_jit, _SRLI(r0, r0, 33)); - em_wp(_jit, _ADDI(r0, r0, hl)); - - em_wp(_jit, _SLLI(r0, r0, 11)); - em_wp(_jit, _ADDI(r0, r0, lh)); - - em_wp(_jit, _SLLI(r0, r0, 11)); - em_wp(_jit, _ADDI(r0, r0, lm)); - - em_wp(_jit, _SLLI(r0, r0, 11)); - em_wp(_jit, _ADDI(r0, r0, ll)); + int64_t lo12 = i0 << 52 >> 52; + int64_t hi52 = (i0 + 0x800) >> 12; + int shift_amount = 12 + count_trailing_zeros((uint64_t) hi52); + hi52 
= (hi52 >> (shift_amount - 12)) << shift_amount >> shift_amount; + movi(_jit, r0, hi52); // Recurse + em_wp(_jit, _SLLI(r0, r0, shift_amount)); + if (lo12) { + em_wp(_jit, _ADDI(r0, r0, lo12)); + } } } From 285cfd284ab6d72a1aea060845c65b1688dcd77a Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Fri, 15 Nov 2024 13:49:21 +0100 Subject: [PATCH 20/23] riscv: fix hi20/lo12 calculations for negative numbers --- lightening/riscv-cpu.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c index 6009870a6..38fa58a0b 100644 --- a/lightening/riscv-cpu.c +++ b/lightening/riscv-cpu.c @@ -1588,10 +1588,10 @@ static void movi(jit_state_t *_jit, int32_t r0, jit_word_t i0) { int32_t srcreg = jit_gpr_regno(_ZERO); - if (simm32_p(i0)){ - int64_t hi = (((i0 + 0x800) >> 12) & 0xFFFFF) << 44 >> 44; - int64_t lo = (int32_t)i0<<20>>20; + if (simm32_p(i0)){ + int32_t hi = (int32_t)(((i0 + 0x800) >> 12) & 0xFFFFF) << 12 >> 12; + int32_t lo = (int32_t)i0<<20>>20; if(hi){ em_wp(_jit, _LUI(r0, hi)); @@ -1632,14 +1632,15 @@ static uint64_t patch_load_from_pool(uint64_t instrs, int32_t off){ load_from_pool_t out, in; - int32_t hi20 = off >>12; + int32_t hi = (int32_t)(((off + 0x800) >> 12) & 0xFFFFF) << 12 >> 12; + int32_t lo = (int32_t)off<<20>>20; in.l = instrs; - out.inst.auipc.w = _AUIPC(in.inst.auipc.U.rd, hi20); + out.inst.auipc.w = _AUIPC(in.inst.auipc.U.rd, hi); out.inst.load.w = Itype(in.inst.load.I.opcode, // `ld` or `lw` in.inst.load.I.rd, in.inst.load.I.funct3, in.inst.load.I.rs1, - off - (hi20<<12)); + lo); return out.l; } From ce8b8e4778e7e757855d4397626228ae0f1979f3 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Fri, 15 Nov 2024 21:51:39 +0100 Subject: [PATCH 21/23] riscv: change stack alignment to 16 --- lightening/riscv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lightening/riscv.c b/lightening/riscv.c index c4b43dc02..3f0adce46 100644 --- a/lightening/riscv.c +++ 
b/lightening/riscv.c @@ -124,7 +124,7 @@ next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg) } *arg = jit_operand_mem (abi, JIT_SP, iter->stack_size); #if __WORDSIZE == 32 - iter->stack_size += 4; + iter->stack_size += 4 + (abi == JIT_OPERAND_ABI_DOUBLE ? 4 : 0); #elif __WORDSIZE == 64 iter->stack_size += 8; #endif @@ -141,7 +141,7 @@ jit_flush(void *fptr, void *tptr) static inline size_t jit_stack_alignment(void) { - return 8; + return 16; // NOTE: See: https://github.com/riscv/riscv-gcc/issues/61 } From 7c20ba77672df6a5d6f8ffc6aab00b5693087f92 Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Mon, 18 Nov 2024 21:08:41 +0100 Subject: [PATCH 22/23] riscv: float/double call convention implementation RISC-V uses a0-a7 registers for argument passing. Float/double arguments use fa0-fa7 first and continue in a0-a7 if needed. Once registers are consumed, stack is used. This commit changes how lightening passes arguments in order to allow this behavior. --- lightening/lightening.c | 27 ++++++++++++++++++++++++++- lightening/riscv-fpu.c | 25 +++++++++++++++++++++++++ lightening/riscv.c | 13 +++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/lightening/lightening.c b/lightening/lightening.c index c66b3a132..f26a467ff 100644 --- a/lightening/lightening.c +++ b/lightening/lightening.c @@ -807,6 +807,14 @@ abi_mem_to_gpr(jit_state_t *_jit, enum jit_operand_abi abi, case JIT_OPERAND_ABI_INT16: jit_ldxi_s(_jit, dst, base, offset); break; + case JIT_OPERAND_ABI_FLOAT: + { + jit_fpr_t tmp = get_temp_fpr(_jit); + jit_ldxi_f(_jit, tmp, base, offset); + jit_movr_i_f(_jit, dst, tmp); + unget_temp_fpr(_jit); + break; + } #if __WORDSIZE == 32 case JIT_OPERAND_ABI_UINT32: case JIT_OPERAND_ABI_POINTER: @@ -823,6 +831,14 @@ abi_mem_to_gpr(jit_state_t *_jit, enum jit_operand_abi abi, case JIT_OPERAND_ABI_INT64: jit_ldxi_l(_jit, dst, base, offset); break; + case JIT_OPERAND_ABI_DOUBLE: + { + jit_fpr_t tmp = get_temp_fpr(_jit); + jit_ldxi_d(_jit, tmp, 
base, offset); + jit_movr_l_d(_jit, dst, tmp); + unget_temp_fpr(_jit); + break; + } #endif default: abort(); @@ -887,7 +903,8 @@ enum move_kind { MOVE_KIND_ENUM(IMM, MEM), MOVE_KIND_ENUM(GPR, MEM), MOVE_KIND_ENUM(FPR, MEM), - MOVE_KIND_ENUM(MEM, MEM) + MOVE_KIND_ENUM(MEM, MEM), + MOVE_KIND_ENUM(FPR, GPR) }; #undef MOVE_KIND_ENUM @@ -901,6 +918,14 @@ move_operand(jit_state_t *_jit, jit_operand_t dst, jit_operand_t src) case MOVE_GPR_TO_GPR: return jit_movr(_jit, dst.loc.gpr.gpr, src.loc.gpr.gpr); + case MOVE_FPR_TO_GPR: +#if __WORDSIZE > 32 + if (src.abi == JIT_OPERAND_ABI_DOUBLE) + return jit_movr_l_d(_jit, dst.loc.gpr.gpr, src.loc.fpr); + else +#endif + return jit_movr_i_f(_jit, dst.loc.gpr.gpr, src.loc.fpr); + case MOVE_MEM_TO_GPR: return abi_mem_to_gpr(_jit, src.abi, dst.loc.gpr.gpr, src.loc.mem.base, src.loc.mem.offset); diff --git a/lightening/riscv-fpu.c b/lightening/riscv-fpu.c index 315ed8d14..b4e7546c7 100644 --- a/lightening/riscv-fpu.c +++ b/lightening/riscv-fpu.c @@ -103,6 +103,10 @@ static void absr_d(jit_state_t *_jit, int32_t r0, int32_t r1); // Transfer operations static void movr_f(jit_state_t *_jit, int32_t r0, int32_t r1); static void movr_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movr_l_d(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1); +static void movr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1); // Argument management static void retr_f(jit_state_t *_jit, int32_t u); @@ -398,6 +402,27 @@ movr_d(jit_state_t *_jit, int32_t r0, int32_t r1) if (r0 != r1) em_wp(_jit, _FMV_D(r0, r1)); } +static void +movr_i_f(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FMV_X_W(r0, r1)); +} +static void +movr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FMV_W_X(r0, r1)); +} +static void +movr_l_d(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FMV_X_D(r0, r1)); +} +static 
void +movr_d_l(jit_state_t *_jit, int32_t r0, int32_t r1) +{ + em_wp(_jit, _FMV_D_X(r0, r1)); +} + static void truncr_f_i(jit_state_t *_jit, int32_t r0, int32_t r1) { diff --git a/lightening/riscv.c b/lightening/riscv.c index 3f0adce46..d3e4efaa3 100644 --- a/lightening/riscv.c +++ b/lightening/riscv.c @@ -111,6 +111,16 @@ reset_abi_arg_iterator(struct abi_arg_iterator *iter, size_t argc, static void next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg) { + // RISC-V Calling convention: + // https://riscv.org/wp-content/uploads/2015/01/riscv-calling.pdf + // + // The RISC-V calling convention passes arguments in registers when possible. + // Up to eight integer registers, a0–a7, and up to eight floating-point + // registers, fa0–fa7, are used for this purpose. + // + // If argument i < 8 is a floating-point type, it is passed in floating-point + // register fai; otherwise, it is passed in integer register ai. + ASSERT(iter->arg_idx < iter->argc); enum jit_operand_abi abi = iter->args[iter->arg_idx].abi; iter->arg_idx++; @@ -121,6 +131,9 @@ next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg) if (is_fpr_arg(abi) && iter->fpr_idx < abi_fpr_arg_count) { *arg = jit_operand_fpr (abi, abi_fpr_args[iter->fpr_idx++]); return; + } else if (is_fpr_arg(abi) && iter->gpr_idx < abi_gpr_arg_count) { + *arg = jit_operand_gpr (abi, abi_gpr_args[iter->gpr_idx++]); + return; } *arg = jit_operand_mem (abi, JIT_SP, iter->stack_size); #if __WORDSIZE == 32 From 105a9c79584d6fb9b086beb83d8738f09875f26a Mon Sep 17 00:00:00 2001 From: Ekaitz Zarraga Date: Mon, 18 Nov 2024 21:12:03 +0100 Subject: [PATCH 23/23] riscv: error if not little endian --- lightening/riscv-cpu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lightening/riscv-cpu.c b/lightening/riscv-cpu.c index 38fa58a0b..101f7395b 100644 --- a/lightening/riscv-cpu.c +++ b/lightening/riscv-cpu.c @@ -17,6 +17,11 @@ * Paulo Cesar Pereira de Andrade * Ekaitz Zarraga */ + +#if __BYTE_ORDER != __LITTLE_ENDIAN 
+#error RISC-V requires little-endian host +#endif + #define stack_framesize (200 + 64) #define simm6_p(im) ((im) <= 31 && (im) >= -32) #define simm12_p(im) ((im) <= 2047 && (im) >= -2048)