diff --git a/ChangeLog b/ChangeLog index cba89a815..17d6b568f 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2006-11-04 Paolo Bonzini + + * lightning/ppc/core.h: Implement jit_allocai, define JIT_FP to be R1. + * lightning/ppc/funcs.h: Store frame size into _jitl. Store R1 before + the STMW, so that the offset is unchanged when we patch the STWU. + * lightning/i386/core-i386.h: Define JIT_FP to be EBP. + * lightning/i386/core-32.h: Implement jit_allocai, put LEAVE in the + epilog if jit_allocai was used. + * lightning/i386/core-64.h: Implement jit_allocai, put LEAVE in the + epilog if jit_allocai was used. + +2006-11-04 Ludovic Courtes + + * lightning/sparc/core.h: Implement jit_allocai. + * tests/allocai.c: New. + * tests/Makefile.am: Point to new tests. + 2006-11-03 Paolo Bonzini * lightning/ppc/core.h: Fix jit_bms using BNE rather than BGT. diff --git a/NEWS b/NEWS index ac0e383bc..9292f12e8 100644 --- a/NEWS +++ b/NEWS @@ -9,7 +9,6 @@ o Support for stack-allocated variables. Because of this, backends defining JIT_FP should now rename it to JIT_AP. JIT_FP is now a user-visible register used in ldxi/ldxr to access stack-allocated variables. - [a promise for now, not yet implemented!] --- diff --git a/lightning/i386/core-32.h b/lightning/i386/core-32.h index d68f8f618..805af0397 100644 --- a/lightning/i386/core-32.h +++ b/lightning/i386/core-32.h @@ -41,21 +41,43 @@ struct jit_local_state { int framesize; int argssize; + int alloca_offset; + int alloca_slack; }; #define jit_base_prolog() (PUSHLr(_EBP), MOVLrr(_ESP, _EBP), PUSHLr(_EBX), PUSHLr(_ESI), PUSHLr(_EDI)) -#define jit_prolog(n) (_jitl.framesize = 8, jit_base_prolog()) +#define jit_prolog(n) (_jitl.framesize = 8, _jitl.alloca_offset = -12, _jitl.alloca_slack = 0, jit_base_prolog()) /* alloca_slack is per-function: reset it so leftover slack from an earlier function is never reused */ -/* The += allows for stack pollution */ +/* Used internally. SLACK is used by the Darwin ABI which keeps the stack + aligned to 16-bytes. */ + +#define jit_allocai_internal(amount, slack) \ + (((amount) < _jitl.alloca_slack \ + ? 
0 \ + : (_jitl.alloca_slack += (amount) + (slack), \ + ((amount) + (slack) == sizeof (int) \ + ? PUSHLr(_EAX) \ + : SUBLir((amount) + (slack), _ESP)))), \ + _jitl.alloca_slack -= (amount), \ + _jitl.alloca_offset -= (amount)) + +/* The += in argssize allows for stack pollution */ #ifdef __APPLE__ - /* Stack must stay 16-byte aligned: */ +/* Stack must stay 16-byte aligned: */ # define jit_prepare_i(ni) (((ni & 0x3) \ ? SUBLir(4 * ((((ni) + 3) & ~(0x3)) - (ni)), JIT_SP) \ : (void)0), \ _jitl.argssize += (((ni) + 3) & ~(0x3))) + +#define jit_allocai(n) \ + jit_allocai_internal ((n), (_jitl.alloca_slack - (n)) & 15) + #else # define jit_prepare_i(ni) (_jitl.argssize += (ni)) + +#define jit_allocai(n) \ + jit_allocai_internal ((n), 0) #endif #define jit_pusharg_i(rs) PUSHLr(rs) @@ -74,7 +96,7 @@ struct jit_local_state { #define jit_patch_long_at(jump_pc,v) (*_PSL((jump_pc) - sizeof(long)) = _jit_SL((jit_insn *)(v) - (jump_pc))) #define jit_patch_at(jump_pc,v) jit_patch_long_at(jump_pc, v) -#define jit_ret() (POPLr(_EDI), POPLr(_ESI), POPLr(_EBX), POPLr(_EBP), RET_()) +#define jit_ret() ((_jitl.alloca_offset < -12 ? (LEALmr(-12, _EBP, 0, 0, _ESP), 0) : 0), POPLr(_EDI), POPLr(_ESI), POPLr(_EBX), (_jitl.alloca_offset < -12 ? LEAVE_() : POPLr(_EBP)), RET_()) /* if jit_allocai lowered ESP, point it back at the saved EDI (EBP-12) before popping; otherwise the pops would read the allocai area */ #endif /* __lightning_core_h */ diff --git a/lightning/i386/core-64.h b/lightning/i386/core-64.h index 7680d1825..420fbcffe 100644 --- a/lightning/i386/core-64.h +++ b/lightning/i386/core-64.h @@ -40,8 +40,23 @@ struct jit_local_state { int long_jumps; int nextarg_geti; int argssize; + int alloca_offset; + int alloca_slack; }; + +/* Keep the stack 16-byte aligned, the SSE hardware prefers it this way. */ +#define jit_allocai_internal(amount, slack) \ + (((amount) < _jitl.alloca_slack \ + ? 
0 \ + : (_jitl.alloca_slack += (amount) + (slack), \ + SUBQir((amount) + (slack), _ESP))), \ + _jitl.alloca_slack -= (amount), \ + _jitl.alloca_offset -= (amount)) + +#define jit_allocai(n) \ + jit_allocai_internal ((n), (_jitl.alloca_slack - (n)) & 15) + /* 3-parameter operation */ #define jit_qopr_(d, s1, s2, op1d, op2d) \ ( (s2 == d) ? op1d : \ @@ -95,7 +110,7 @@ struct jit_local_state { #define jit_popr_l(rs) POPQr(rs) #define jit_base_prolog() (PUSHQr(_EBP), MOVQrr(_ESP, _EBP), PUSHQr(_EBX), PUSHQr(_R12), PUSHQr(_R13)) -#define jit_prolog(n) (_jitl.nextarg_geti = 0, jit_base_prolog()) +#define jit_prolog(n) (_jitl.nextarg_geti = 0, _jitl.alloca_offset = -24, _jitl.alloca_slack = 0, jit_base_prolog()) /* alloca_slack is per-function: reset it so leftover slack from an earlier function is never reused */ /* Stack isn't used for arguments: */ #define jit_prepare_i(ni) (_jitl.argssize = 0) @@ -154,7 +169,7 @@ static int jit_arg_reg_order[] = { _EDI, _ESI, _EDX, _ECX }; #define jit_patch_long_at(jump_pc,v) (*_PSL((jump_pc) - sizeof(long)) = _jit_SL((jit_insn *)(v))) #define jit_patch_short_at(jump_pc,v) (*_PSI((jump_pc) - sizeof(int)) = _jit_SI((jit_insn *)(v) - (jump_pc))) #define jit_patch_at(jump_pc,v) (_jitl.long_jumps ? jit_patch_long_at((jump_pc)-3, v) : jit_patch_short_at(jump_pc, v)) -#define jit_ret() (POPQr(_R13), POPQr(_R12), POPQr(_EBX), POPQr(_EBP), RET_()) +#define jit_ret() ((_jitl.alloca_offset < -24 /* jit_allocai lowered the stack pointer: point it back at the saved R13 (frame pointer - 24) before popping */ ? (LEAQmr(-24, _EBP, 0, 0, _ESP), 0) : 0), POPQr(_R13), POPQr(_R12), POPQr(_EBX), (_jitl.alloca_offset < -24 ? 
LEAVE_() : POPQr(_EBP)), RET_()) #define _jit_ldi_l(d, is) MOVQmr((is), 0, 0, 0, (d)) #define jit_ldr_l(d, rs) MOVQmr(0, (rs), 0, 0, (d)) diff --git a/lightning/i386/core-i386.h b/lightning/i386/core-i386.h index 234546772..0e3e97b59 100644 --- a/lightning/i386/core-i386.h +++ b/lightning/i386/core-i386.h @@ -35,6 +35,7 @@ #define __lightning_core_i386_h #define JIT_AP _EBP +#define JIT_FP _EBP #define JIT_SP _ESP #define JIT_RET _EAX diff --git a/lightning/ppc/core.h b/lightning/ppc/core.h index cea8022b5..962aa7bad 100644 --- a/lightning/ppc/core.h +++ b/lightning/ppc/core.h @@ -42,9 +42,27 @@ struct jit_local_state { int nextarg_geti; /* Next r20-r25 reg. to be read */ int nextarg_getd; /* The FP args are picked up from FPR1 -> FPR10 */ int nbArgs; /* Number of arguments for the prolog */ + + int frame_size, slack; + jit_insn *stwu; }; +/* Patch a `stwu' instruction (with immediate operand) so that it decreases + r1 by AMOUNT. AMOUNT should already be rounded so that %sp remains quadword + aligned. */ +#define jit_patch_stwu(amount) \ + (*(_jitl.stwu) &= ~_MASK (16), \ + *(_jitl.stwu) |= _s16 ((amount))) + +#define jit_allocai(n) \ + (_jitl.frame_size += (n), \ + ((n) <= _jitl.slack \ + ? 
0 : jit_patch_stwu (-((_jitl.frame_size + 15) & ~15))), \ + _jitl.slack = ((_jitl.frame_size + 15) & ~15) - _jitl.frame_size, \ + _jitl.frame_size - (n)) + #define JIT_SP 1 +#define JIT_FP 1 #define JIT_RET 3 #define JIT_R_NUM 3 #define JIT_V_NUM 7 @@ -52,9 +70,6 @@ struct jit_local_state { #define JIT_V(i) (31-(i)) #define JIT_AUX JIT_V(JIT_V_NUM) /* for 32-bit operands & shift counts */ -#define jit_pfx_start() (_jit.jitl.trampolines) -#define jit_pfx_end() (_jit.jitl.free) - /* If possible, use the `small' instruction (rd, rs, imm) * else load imm into r26 and use the `big' instruction (rd, rs, r26) */ diff --git a/lightning/ppc/funcs.h b/lightning/ppc/funcs.h index 90d84d23f..22c277ecd 100644 --- a/lightning/ppc/funcs.h +++ b/lightning/ppc/funcs.h @@ -91,34 +91,23 @@ static void _jit_epilog(jit_state *jit) { int n = _jitl.nbArgs; - int frame_size, ofs; int first_saved_reg = JIT_AUX - n; int num_saved_regs = 32 - first_saved_reg; - - frame_size = 24 + 32 + num_saved_regs * 4; /* r24..r31 + args */ - frame_size += 15; /* the stack must be quad-word */ - frame_size &= ~15; /* aligned */ + int frame_size = (_jitl.frame_size + 15) & ~15; #ifdef __APPLE__ - LWZrm(0, frame_size + 8, 1); /* lwz r0, x+8(r1) (ret.addr.) */ + LWZrm(0, frame_size + 8, 1); /* lwz r0, x+8(r1) (ret.addr.) */ #else - LWZrm(0, frame_size + 4, 1); /* lwz r0, x+4(r1) (ret.addr.) */ + LWZrm(0, frame_size + 4, 1); /* lwz r0, x+4(r1) (ret.addr.) */ #endif MTLRr(0); /* mtspr LR, r0 */ - ofs = frame_size - num_saved_regs * 4; - LMWrm(first_saved_reg, ofs, 1); /* lmw rI, ofs(r1) */ + LMWrm(first_saved_reg, 24 + 32, 1); /* lmw rI, ofs(r1) */ ADDIrri(1, 1, frame_size); /* addi r1, r1, x */ BLR(); /* blr */ } /* Emit a prolog for a function. 
- Upon entrance to the trampoline: - - LR = address where the real code for the function lies - - R3-R8 = parameters - Upon finishing the trampoline: - - R0 = return address for the function - - R25-R20 = parameters (order is reversed, 1st argument is R25) The +32 in frame_size computation is to accound for the parameter area of a function frame. @@ -126,7 +115,7 @@ _jit_epilog(jit_state *jit) On PPC the frame must have space to host the arguments of any callee. However, as it currently stands, the argument to jit_trampoline (n) is the number of arguments of the caller we generate. Therefore, the - callee can overwrite a part of the stack (saved register area when it + callee can overwrite a part of the stack (saved register area) when it flushes its own parameter on the stack. The addition of a constant offset = 32 is enough to hold eight 4 bytes arguments. This is less than perfect but is a reasonable work around for now. @@ -134,8 +123,8 @@ _jit_epilog(jit_state *jit) static void _jit_prolog(jit_state *jit, int n) { - int frame_size; - int ofs, i; + int frame_size; + int i; int first_saved_reg = JIT_AUX - n; int num_saved_regs = 32 - first_saved_reg; @@ -143,20 +132,31 @@ _jit_prolog(jit_state *jit, int n) _jitl.nextarg_getd = 1; _jitl.nbArgs = n; - frame_size = 24 + 32 + num_saved_regs * 4; /* r27..r31 + args */ - frame_size += 15; /* the stack must be quad-word */ - frame_size &= ~15; /* aligned */ - MFLRr(0); - STWUrm(1, -frame_size, 1); /* stwu r1, -x(r1) */ - ofs = frame_size - num_saved_regs * 4; - STMWrm(first_saved_reg, ofs, 1); /* stmw rI, ofs(r1) */ #ifdef __APPLE__ - STWrm(0, frame_size + 8, 1); /* stw r0, x+8(r1) */ + STWrm(0, 8, 1); /* stw r0, 8(r1) */ #else - STWrm(0, frame_size + 4, 1); /* stw r0, x+4(r1) */ + STWrm(0, 4, 1); /* stw r0, 4(r1) */ #endif + + /* 0..55 -> frame data + 56..frame_size -> saved registers + + The STWU instruction is patched by jit_allocai, thus leaving + the space for the allocai above the 56 bytes. 
jit_allocai is + also able to reuse the slack space needed to keep the stack + quadword-aligned. */ + + _jitl.frame_size = 24 + 32 + num_saved_regs * 4; /* r27..r31 + args */ + + /* The stack must be quad-word aligned. */ + frame_size = (_jitl.frame_size + 15) & ~15; + _jitl.slack = frame_size - _jitl.frame_size; + _jitl.stwu = _jit.x.pc; + STWUrm(1, -frame_size, 1); /* stwu r1, -x(r1) */ + + STMWrm(first_saved_reg, 24 + 32, 1); /* stmw rI, ofs(r1) */ for (i = 0; i < n; i++) MRrr(JIT_AUX-1-i, 3+i); /* save parameters below r24 */ } diff --git a/tests/Makefile.in b/tests/Makefile.in index ae65e726c..96613a18a 100644 --- a/tests/Makefile.in +++ b/tests/Makefile.in @@ -41,7 +41,7 @@ check_PROGRAMS = fibit$(EXEEXT) incr$(EXEEXT) printf$(EXEEXT) \ printf2$(EXEEXT) rpn$(EXEEXT) fib$(EXEEXT) fibdelay$(EXEEXT) \ add$(EXEEXT) bp$(EXEEXT) testfp$(EXEEXT) funcfp$(EXEEXT) \ rpnfp$(EXEEXT) modi$(EXEEXT) ldxi$(EXEEXT) divi$(EXEEXT) \ - movi$(EXEEXT) ret$(EXEEXT) + movi$(EXEEXT) ret$(EXEEXT) allocai$(EXEEXT) subdir = tests DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 @@ -56,6 +56,11 @@ add_SOURCES = add.c add_OBJECTS = add.$(OBJEXT) add_LDADD = $(LDADD) @DISASS_TRUE@add_DEPENDENCIES = $(top_builddir)/opcode/libdisass.a +allocai_SOURCES = allocai.c +allocai_OBJECTS = allocai.$(OBJEXT) +allocai_LDADD = $(LDADD) +@DISASS_TRUE@allocai_DEPENDENCIES = \ +@DISASS_TRUE@ $(top_builddir)/opcode/libdisass.a bp_SOURCES = bp.c bp_OBJECTS = bp.$(OBJEXT) bp_LDADD = $(LDADD) @@ -129,12 +134,12 @@ COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) CCLD = $(CC) LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@ -SOURCES = add.c bp.c divi.c fib.c fibdelay.c fibit.c funcfp.c incr.c \ - ldxi.c modi.c movi.c printf.c printf2.c ret.c rpn.c rpnfp.c \ - testfp.c -DIST_SOURCES = add.c bp.c divi.c fib.c fibdelay.c fibit.c funcfp.c \ - incr.c ldxi.c modi.c movi.c printf.c printf2.c 
ret.c rpn.c \ - rpnfp.c testfp.c +SOURCES = add.c allocai.c bp.c divi.c fib.c fibdelay.c fibit.c \ + funcfp.c incr.c ldxi.c modi.c movi.c printf.c printf2.c ret.c \ + rpn.c rpnfp.c testfp.c +DIST_SOURCES = add.c allocai.c bp.c divi.c fib.c fibdelay.c fibit.c \ + funcfp.c incr.c ldxi.c modi.c movi.c printf.c printf2.c ret.c \ + rpn.c rpnfp.c testfp.c DATA = $(noinst_DATA) ETAGS = etags CTAGS = ctags @@ -242,12 +247,13 @@ target_vendor = @target_vendor@ AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir) -I$(top_srcdir)/lightning/$(cpu) noinst_DATA = fibit.ok incr.ok printf.ok printf2.ok rpn.ok \ fib.ok fibdelay.ok testfp.ok funcfp.ok rpnfp.ok add.ok \ - bp.ok modi.ok ldxi.ok divi.ok movi.ok ret.ok + bp.ok modi.ok ldxi.ok divi.ok movi.ok ret.ok \ + allocai.ok EXTRA_DIST = $(noinst_DATA) run-test @DISASS_TRUE@LDADD = $(top_builddir)/opcode/libdisass.a @REGRESSION_TESTING_TRUE@TESTS = fib fibit fibdelay incr printf printf2 rpn add bp \ -@REGRESSION_TESTING_TRUE@ testfp funcfp rpnfp modi ldxi divi movi ret +@REGRESSION_TESTING_TRUE@ testfp funcfp rpnfp modi ldxi divi movi ret allocai @REGRESSION_TESTING_TRUE@TESTS_ENVIRONMENT = $(srcdir)/run-test all: all-am @@ -289,6 +295,9 @@ clean-checkPROGRAMS: add$(EXEEXT): $(add_OBJECTS) $(add_DEPENDENCIES) @rm -f add$(EXEEXT) $(LINK) $(add_LDFLAGS) $(add_OBJECTS) $(add_LDADD) $(LIBS) +allocai$(EXEEXT): $(allocai_OBJECTS) $(allocai_DEPENDENCIES) + @rm -f allocai$(EXEEXT) + $(LINK) $(allocai_LDFLAGS) $(allocai_OBJECTS) $(allocai_LDADD) $(LIBS) bp$(EXEEXT): $(bp_OBJECTS) $(bp_DEPENDENCIES) @rm -f bp$(EXEEXT) $(LINK) $(bp_LDFLAGS) $(bp_OBJECTS) $(bp_LDADD) $(LIBS) @@ -345,6 +354,7 @@ distclean-compile: -rm -f *.tab.c @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/add.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/allocai.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/bp.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/divi.Po@am__quote@ @AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/fib.Po@am__quote@