mirror of
https://git.savannah.gnu.org/git/guile.git
synced 2025-05-28 16:00:22 +02:00
The problem with callr is that the register that contains the function to be called can be overwritten by the logic that moves the values into argument registers. To fix this, I added a get_callr_temp function that should return a platform-specific register that is not used to pass arguments. For Aarch64/Arm the link register seems to work; for Amd64/i686 the RAX register. The function/tmp pair becomes an additional argument to the parallel assignment; this way the original function register is not accidentally overwritten. The problem with calli is that it may not have enough temp registers to move arguments. The windmill paper says that at most one temporary register is needed for the parallel assignment. However, we also need a temp register for mem-to-mem moves. So it seems that we need a second temporary. For Amd64/i686 we have only one temporary GPR and one temporary FPR. To fix this, I modified the algorithm from the paper a bit: we perform the mem-to-mem moves before the other moves. Later when we need the temp to break cycles, there shouldn't be any mem-to-mem moves left. So we should never need two temps at the same time. * lightening/lightening.c: (get_callr_temp): New function; needed for each platform. (prepare_call_args): Include the function/callr_temp pair in the arguments for the parallel assignment. * lightening/x86.c, lightening/arm.c, lightening/aarch64.c (get_callr_temp): Implementation for each platform. * lightening/arm.c (next_abi_arg): Fix the stack size for doubles. * tests/call_10_2.c, tests/callr_10.c: New tests. * tests/regarrays.inc: New file. Common code between the above two tests that would be tedious to duplicate.
413 lines
11 KiB
C
413 lines
11 KiB
C
/*
|
|
* Copyright (C) 2012-2020 Free Software Foundation, Inc.
|
|
*
|
|
* This file is part of GNU lightning.
|
|
*
|
|
* GNU lightning is free software; you can redistribute it and/or modify it
|
|
* under the terms of the GNU Lesser General Public License as published
|
|
* by the Free Software Foundation; either version 3, or (at your option)
|
|
* any later version.
|
|
*
|
|
* GNU lightning is distributed in the hope that it will be useful, but
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
|
|
* License for more details.
|
|
*
|
|
* Authors:
|
|
* Paulo Cesar Pereira de Andrade
|
|
*/
|
|
|
|
/* NOTE(review): presumably a "no register" sentinel value; it is not
   used in the visible part of this file -- confirm against its users. */
#define _NOREG 0xffff
|
|
|
|
/* One single-bit flag per x86 feature that this backend records.  */
typedef struct {
  uint32_t fpu        : 1;  /* x87 present */
  uint32_t cmpxchg8b  : 1;  /* cmpxchg8b instruction */
  uint32_t cmov       : 1;  /* cmov and fcmov branchless conditional mov */
  uint32_t mmx        : 1;  /* mmx registers/instructions available */
  uint32_t sse        : 1;  /* sse registers/instructions available */
  uint32_t sse2       : 1;  /* sse2 registers/instructions available */
  uint32_t sse3       : 1;  /* sse3 instructions available */
  uint32_t pclmulqdq  : 1;  /* pclmulqdq instruction */
  uint32_t ssse3      : 1;  /* ssse3 supplemental sse3 instructions available */
  uint32_t fma        : 1;  /* fused multiply/add using ymm state */
  uint32_t cmpxchg16b : 1;  /* cmpxchg16b instruction */
  uint32_t sse4_1     : 1;  /* sse4.1 instructions available */
  uint32_t sse4_2     : 1;  /* sse4.2 instructions available */
  uint32_t movbe      : 1;  /* movbe instruction available */
  uint32_t popcnt     : 1;  /* popcnt instruction available */
  uint32_t aes        : 1;  /* aes instructions available */
  uint32_t avx        : 1;  /* avx instructions available */
  uint32_t lahf       : 1;  /* lahf/sahf available in 64 bits mode */
} jit_cpu_t;
|
|
|
|
/* Feature flags of the host CPU.  Written by jit_get_cpu() and read by
   jit_init(); all flags are zero until jit_get_cpu() has run. */
static jit_cpu_t jit_cpu;
|
|
|
|
static inline jit_reloc_t
|
|
emit_rel8_reloc (jit_state_t *_jit, uint8_t inst_start)
|
|
{
|
|
uint8_t *loc = _jit->pc.uc;
|
|
emit_u8 (_jit, 0);
|
|
return jit_reloc(_jit, JIT_RELOC_REL8, inst_start, loc, _jit->pc.uc, 0);
|
|
}
|
|
|
|
static inline jit_reloc_t
|
|
emit_rel32_reloc (jit_state_t *_jit, uint8_t inst_start)
|
|
{
|
|
uint8_t *loc = _jit->pc.uc;
|
|
emit_u32 (_jit, 0);
|
|
return jit_reloc(_jit, JIT_RELOC_REL32, inst_start, loc, _jit->pc.uc, 0);
|
|
}
|
|
|
|
#include "x86-cpu.c"
|
|
#include "x86-sse.c"
|
|
|
|
jit_bool_t
|
|
jit_get_cpu(void)
|
|
{
|
|
union {
|
|
struct {
|
|
uint32_t sse3 : 1;
|
|
uint32_t pclmulqdq : 1;
|
|
uint32_t dtes64 : 1; /* amd reserved */
|
|
uint32_t monitor : 1;
|
|
uint32_t ds_cpl : 1; /* amd reserved */
|
|
uint32_t vmx : 1; /* amd reserved */
|
|
uint32_t smx : 1; /* amd reserved */
|
|
uint32_t est : 1; /* amd reserved */
|
|
uint32_t tm2 : 1; /* amd reserved */
|
|
uint32_t ssse3 : 1;
|
|
uint32_t cntx_id : 1; /* amd reserved */
|
|
uint32_t __reserved0 : 1;
|
|
uint32_t fma : 1;
|
|
uint32_t cmpxchg16b : 1;
|
|
uint32_t xtpr : 1; /* amd reserved */
|
|
uint32_t pdcm : 1; /* amd reserved */
|
|
uint32_t __reserved1 : 1;
|
|
uint32_t pcid : 1; /* amd reserved */
|
|
uint32_t dca : 1; /* amd reserved */
|
|
uint32_t sse4_1 : 1;
|
|
uint32_t sse4_2 : 1;
|
|
uint32_t x2apic : 1; /* amd reserved */
|
|
uint32_t movbe : 1; /* amd reserved */
|
|
uint32_t popcnt : 1;
|
|
uint32_t tsc : 1; /* amd reserved */
|
|
uint32_t aes : 1;
|
|
uint32_t xsave : 1;
|
|
uint32_t osxsave : 1;
|
|
uint32_t avx : 1;
|
|
uint32_t __reserved2 : 1; /* amd F16C */
|
|
uint32_t __reserved3 : 1;
|
|
uint32_t __alwayszero : 1; /* amd RAZ */
|
|
} bits;
|
|
jit_uword_t cpuid;
|
|
} ecx;
|
|
union {
|
|
struct {
|
|
uint32_t fpu : 1;
|
|
uint32_t vme : 1;
|
|
uint32_t de : 1;
|
|
uint32_t pse : 1;
|
|
uint32_t tsc : 1;
|
|
uint32_t msr : 1;
|
|
uint32_t pae : 1;
|
|
uint32_t mce : 1;
|
|
uint32_t cmpxchg8b : 1;
|
|
uint32_t apic : 1;
|
|
uint32_t __reserved0 : 1;
|
|
uint32_t sep : 1;
|
|
uint32_t mtrr : 1;
|
|
uint32_t pge : 1;
|
|
uint32_t mca : 1;
|
|
uint32_t cmov : 1;
|
|
uint32_t pat : 1;
|
|
uint32_t pse36 : 1;
|
|
uint32_t psn : 1; /* amd reserved */
|
|
uint32_t clfsh : 1;
|
|
uint32_t __reserved1 : 1;
|
|
uint32_t ds : 1; /* amd reserved */
|
|
uint32_t acpi : 1; /* amd reserved */
|
|
uint32_t mmx : 1;
|
|
uint32_t fxsr : 1;
|
|
uint32_t sse : 1;
|
|
uint32_t sse2 : 1;
|
|
uint32_t ss : 1; /* amd reserved */
|
|
uint32_t htt : 1;
|
|
uint32_t tm : 1; /* amd reserved */
|
|
uint32_t __reserved2 : 1;
|
|
uint32_t pbe : 1; /* amd reserved */
|
|
} bits;
|
|
jit_uword_t cpuid;
|
|
} edx;
|
|
#if __X32
|
|
int ac, flags;
|
|
#endif
|
|
jit_uword_t eax, ebx;
|
|
|
|
#if __X32
|
|
/* adapted from glibc __sysconf */
|
|
__asm__ volatile ("pushfl;\n\t"
|
|
"popl %0;\n\t"
|
|
"movl $0x240000, %1;\n\t"
|
|
"xorl %0, %1;\n\t"
|
|
"pushl %1;\n\t"
|
|
"popfl;\n\t"
|
|
"pushfl;\n\t"
|
|
"popl %1;\n\t"
|
|
"xorl %0, %1;\n\t"
|
|
"pushl %0;\n\t"
|
|
"popfl"
|
|
: "=r" (flags), "=r" (ac));
|
|
|
|
/* i386 or i486 without cpuid */
|
|
if ((ac & (1 << 21)) == 0)
|
|
/* probably without x87 as well */
|
|
return 0;
|
|
#endif
|
|
|
|
/* query %eax = 1 function */
|
|
__asm__ volatile (
|
|
#if __X32 || __X64_32
|
|
"xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1"
|
|
#else
|
|
"xchgq %%rbx, %1; cpuid; xchgq %%rbx, %1"
|
|
#endif
|
|
: "=a" (eax), "=r" (ebx),
|
|
"=c" (ecx.cpuid), "=d" (edx.cpuid)
|
|
: "0" (1));
|
|
|
|
jit_cpu.fpu = edx.bits.fpu;
|
|
jit_cpu.cmpxchg8b = edx.bits.cmpxchg8b;
|
|
jit_cpu.cmov = edx.bits.cmov;
|
|
jit_cpu.mmx = edx.bits.mmx;
|
|
jit_cpu.sse = edx.bits.sse;
|
|
jit_cpu.sse2 = edx.bits.sse2;
|
|
jit_cpu.sse3 = ecx.bits.sse3;
|
|
jit_cpu.pclmulqdq = ecx.bits.pclmulqdq;
|
|
jit_cpu.ssse3 = ecx.bits.ssse3;
|
|
jit_cpu.fma = ecx.bits.fma;
|
|
jit_cpu.cmpxchg16b = ecx.bits.cmpxchg16b;
|
|
jit_cpu.sse4_1 = ecx.bits.sse4_1;
|
|
jit_cpu.sse4_2 = ecx.bits.sse4_2;
|
|
jit_cpu.movbe = ecx.bits.movbe;
|
|
jit_cpu.popcnt = ecx.bits.popcnt;
|
|
jit_cpu.aes = ecx.bits.aes;
|
|
jit_cpu.avx = ecx.bits.avx;
|
|
|
|
/* query %eax = 0x80000001 function */
|
|
#if __X64
|
|
__asm__ volatile (
|
|
# if __X64_32
|
|
"xchgl %%ebx, %1; cpuid; xchgl %%ebx, %1"
|
|
# else
|
|
"xchgq %%rbx, %1; cpuid; xchgq %%rbx, %1"
|
|
# endif
|
|
: "=a" (eax), "=r" (ebx),
|
|
"=c" (ecx.cpuid), "=d" (edx.cpuid)
|
|
: "0" (0x80000001));
|
|
jit_cpu.lahf = ecx.cpuid & 1;
|
|
#endif
|
|
|
|
return jit_cpu.sse2;
|
|
}
|
|
|
|
jit_bool_t
|
|
jit_init(jit_state_t *_jit)
|
|
{
|
|
return jit_cpu.sse2;
|
|
}
|
|
|
|
static const jit_gpr_t abi_gpr_args[] = {
|
|
#if __X32
|
|
/* No GPRs in args. */
|
|
#elif __CYGWIN__
|
|
_RCX, _RDX, _R8, _R9
|
|
#else
|
|
_RDI, _RSI, _RDX, _RCX, _R8, _R9
|
|
#endif
|
|
};
|
|
|
|
static const jit_fpr_t abi_fpr_args[] = {
|
|
#if __X32
|
|
/* No FPRs in args. */
|
|
#elif __CYGWIN__
|
|
_XMM0, _XMM1, _XMM2, _XMM3
|
|
#else
|
|
_XMM0, _XMM1, _XMM2, _XMM3, _XMM4, _XMM5, _XMM6, _XMM7
|
|
#endif
|
|
};
|
|
|
|
static const int abi_gpr_arg_count = sizeof(abi_gpr_args) / sizeof(abi_gpr_args[0]);
|
|
static const int abi_fpr_arg_count = sizeof(abi_fpr_args) / sizeof(abi_fpr_args[0]);
|
|
|
|
struct abi_arg_iterator
|
|
{
|
|
const jit_operand_t *args;
|
|
size_t argc;
|
|
|
|
size_t arg_idx;
|
|
size_t gpr_idx;
|
|
size_t fpr_idx;
|
|
size_t stack_size;
|
|
size_t stack_padding;
|
|
};
|
|
|
|
static size_t
|
|
jit_operand_abi_sizeof(enum jit_operand_abi abi)
|
|
{
|
|
switch (abi) {
|
|
case JIT_OPERAND_ABI_UINT8:
|
|
case JIT_OPERAND_ABI_INT8:
|
|
return 1;
|
|
case JIT_OPERAND_ABI_UINT16:
|
|
case JIT_OPERAND_ABI_INT16:
|
|
return 2;
|
|
case JIT_OPERAND_ABI_UINT32:
|
|
case JIT_OPERAND_ABI_INT32:
|
|
return 4;
|
|
case JIT_OPERAND_ABI_UINT64:
|
|
case JIT_OPERAND_ABI_INT64:
|
|
return 8;
|
|
case JIT_OPERAND_ABI_POINTER:
|
|
return CHOOSE_32_64(4, 8);
|
|
case JIT_OPERAND_ABI_FLOAT:
|
|
return 4;
|
|
case JIT_OPERAND_ABI_DOUBLE:
|
|
return 8;
|
|
default:
|
|
abort();
|
|
}
|
|
}
|
|
|
|
/* Round BYTES up to the next multiple of the machine word size, which
   is CHOOSE_32_64(4, 8) bytes.  */
static size_t
round_size_up_to_words(size_t bytes)
{
  const size_t bytes_per_word = CHOOSE_32_64(4, 8);
  const size_t word_count = (bytes + bytes_per_word - 1) / bytes_per_word;

  return bytes_per_word * word_count;
}
|
|
|
|
/* Return the number of bytes already on the stack when a function is
   entered: one machine word (__WORDSIZE bits) for the saved return
   address.  */
static size_t
jit_initial_frame_size (void)
{
  // Saved return address is on stack.
  size_t return_address_bytes = __WORDSIZE / 8;

  return return_address_bytes;
}
|
|
|
|
static void
|
|
reset_abi_arg_iterator(struct abi_arg_iterator *iter, size_t argc,
|
|
const jit_operand_t *args)
|
|
{
|
|
memset(iter, 0, sizeof *iter);
|
|
iter->argc = argc;
|
|
iter->args = args;
|
|
#if __CYGWIN__ && __X64
|
|
// Reserve slots on the stack for 4 register parameters (8 bytes each).
|
|
iter->stack_size = 32;
|
|
#endif
|
|
}
|
|
|
|
static void
|
|
next_abi_arg(struct abi_arg_iterator *iter, jit_operand_t *arg)
|
|
{
|
|
ASSERT(iter->arg_idx < iter->argc);
|
|
enum jit_operand_abi abi = iter->args[iter->arg_idx].abi;
|
|
if (is_gpr_arg(abi) && iter->gpr_idx < abi_gpr_arg_count) {
|
|
*arg = jit_operand_gpr (abi, abi_gpr_args[iter->gpr_idx++]);
|
|
#ifdef __CYGWIN__
|
|
iter->fpr_idx++;
|
|
#endif
|
|
} else if (is_fpr_arg(abi) && iter->fpr_idx < abi_fpr_arg_count) {
|
|
*arg = jit_operand_fpr (abi, abi_fpr_args[iter->fpr_idx++]);
|
|
#ifdef __CYGWIN__
|
|
iter->gpr_idx++;
|
|
#endif
|
|
} else {
|
|
*arg = jit_operand_mem (abi, JIT_SP, iter->stack_size);
|
|
size_t bytes = jit_operand_abi_sizeof (abi);
|
|
iter->stack_size += round_size_up_to_words (bytes);
|
|
}
|
|
iter->arg_idx++;
|
|
}
|
|
|
|
/* Does nothing on this target; both arguments are ignored.
   NOTE(review): presumably x86 needs no explicit instruction-cache
   flush for the range FPTR..TPTR -- confirm against other backends.  */
static void
jit_flush(void *fptr, void *tptr)
{
  (void) fptr;
  (void) tptr;
}
|
|
|
|
/* Return the stack alignment, in bytes, used by this backend.  */
static inline size_t
jit_stack_alignment(void)
{
  enum { STACK_ALIGNMENT_BYTES = 16 };

  return STACK_ALIGNMENT_BYTES;
}
|
|
|
|
static void
|
|
jit_try_shorten(jit_state_t *_jit, jit_reloc_t reloc, jit_pointer_t addr)
|
|
{
|
|
uint8_t *loc = _jit->start + reloc.offset;
|
|
uint8_t *start = loc - reloc.inst_start_offset;
|
|
uint8_t *end = _jit->pc.uc;
|
|
jit_imm_t i0 = (jit_imm_t)addr;
|
|
|
|
if (loc == start)
|
|
return;
|
|
|
|
if (start < (uint8_t*)addr && (uint8_t*)addr <= end)
|
|
return;
|
|
|
|
switch (reloc.kind)
|
|
{
|
|
case JIT_RELOC_ABSOLUTE: {
|
|
_jit->pc.uc = start;
|
|
ASSERT((loc[-1] & ~7) == 0xb8); // MOVI
|
|
int32_t r0 = loc[-1] & 7;
|
|
if (start != loc - 1) {
|
|
ASSERT(start == loc - 2);
|
|
r0 |= (loc[-2] & 1) << 3;
|
|
}
|
|
return movi(_jit, r0, i0);
|
|
}
|
|
case JIT_RELOC_REL8:
|
|
ASSERT((loc[-1] & ~0xf) == 0x70 || loc[-1] == 0xeb); // JCCSI or JMPSI
|
|
/* Nothing useful to do. */
|
|
return;
|
|
case JIT_RELOC_REL32:
|
|
_jit->pc.uc = start;
|
|
if (start[0] == 0xe9) { // JMP
|
|
return jmpi(_jit, i0);
|
|
}
|
|
ASSERT(start[0] == 0x0f); // JCC
|
|
return jcci(_jit, start[1] & ~0x80, i0);
|
|
default:
|
|
/* We don't emit other kinds of reloc. */
|
|
abort ();
|
|
}
|
|
}
|
|
|
|
/* Return PTR unchanged: on this target no adjustment is applied to a
   function pointer.  */
static void*
bless_function_pointer(void *ptr)
{
  void *blessed = ptr;

  return blessed;
}
|
|
|
|
static jit_gpr_t
|
|
get_callr_temp (jit_state_t * _jit)
|
|
{
|
|
return _RAX;
|
|
}
|