big merge

git-archimport-id: bonzini@gnu.org--2004b/lightning--stable--1.2--patch-1 git-archimport-id: bonzini@gnu.org--2004b/lightning--stable--1.2--patch-2
2025-06-21 03:00:19 +02:00 · 2004-10-14 16:10:07 +00:00 · 2004-10-14 16:10:07 +00:00 · ba5044a668
commit ba5044a668
parent a72bbf2f6a
49 changed files with 2022 additions and 1059 deletions
--- a/.cvsignore
+++ b/.cvsignore
@ -0,0 +1 @@
+autom4te.cache
--- a/1
+++ b/1
@ -1,2 +1,3 @@
 Paolo Bonzini <bonzini@gnu.org>
 i386 and PPC assemblers by Ian Piumarta <piumarta@inria.fr>
+Major PPC contributions by Laurent Michel <ldm@thorgal.homelinux.org>
--- a/214
+++ b/214
@ -1,3 +1,217 @@
+2004-10-12  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/i386/fp.h: Fix bugs in conditional branches.
+
+2004-10-10  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/i386/funcs.h: Fix pasto in jit_flush_code.
+
+2004-10-08  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/ppc/fp.h: Optimized conditional branches.
+
+2004-09-20  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/ppc/asm.h: Fix more typos.
+
+2004-09-20  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/ppc/asm.h: Fix typos, replace `26' with JIT_AUX.
+
+2004-09-20  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/ppc/fp.h: Added conditional branches.
+
+2004-09-18  Laurent Michel  <ldm@thorgal.homelinux.org>
+
+	* lightning/ppc/fp.h (jit_unler_d, jit_unltr_d, jit_unger_d,
+	jit_ungtr_d, jit_ltgt_d, jit_uneq_d): Implemented missing tests
+	to fully support testfp.
+	(jit_floorr_d_i, jit_ceilr_d_i, jit_roundr_d_i, jit_truncr_d_i):
+	New macros.
+	* lightning/ppc/asm.h: Added missing opcodes FCTIWZ and MTFSFI.
+	* lightning/ppc/funcs.h (_jit_prolog): Fixed minor mistake in
+	the initialization of _jitl.nextarg_geti, relying on the
+	JIT_AUX macro as well to get the register offset.
+
+2004-09-07  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/ppc/funcs.h: Fix typo.
+
+2004-09-06  Paolo Bonzini  <bonzini@gnu.org>
+
+	* tests/funcfp.c: Use %g.  Remove C99 variable declarations.
+	* tests/testfp.c: Don't use __builtin_nan.
+
+	* lightning/ppc/core.h: Add three V registers.
+	* lightning/ppc/funcs.h: Adjust.
+
+	* lightning/sparc/core.h: Some fixes related to FP argument passing.
+	Move R0 to %g2, use %o7 for JIT_BIG2.
+	* lightning/sparc/fp.h: Some fixes related to FP argument passing.
+
+2004-09-02  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/sparc/core.h: Add another V register,
+	move R0 to %o7.
+
+2004-07-15  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/i386/funcs.h: Implement jit_flush_code,
+	in order to support Fedora's exec-shield.
+
+2004-07-14  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/core-common.h: Add more jit_extr_*_* macros.
+	* lightning/doc/using.texi: Be clearer about the order
+	of arguments in jit_extr_*_*.
+	* lightning/doc/porting.texi: Add more jit_extr_*_* macros.
+	* lightning/i386/fp.h: Fix typo in jit_extr_i_d.
+
+2004-07-14  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/ppc/funcs.h: Adjust offset of LR into
+	stack frame if running under the Darwin ABI.
+
+2004-07-13  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/i386/fp.h: Rename jit_exti_d to jit_extr_i_d.
+
+2004-07-13  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/ppc/core.h: Fix thinko.
+
+	* lightning/i386/core.h: Fix jit_lti_ui.
+	* lightning/core-common.h: Add missing macros.
+
+	* lightning/ppc/fp.h: Rename jit_neg_* to jit_negr_*.
+	* lightning/i386/fp.h: Rename jit_neg_* to jit_negr_*.
+	* lightning/sparc/fp.h: Rename jit_neg_* to jit_negr_*.
+	* lightning/fp-common.h: Rename jit_neg_* to jit_negr_*.
+	* doc/porting.texi: Add undocumented macros.
+
+2004-07-12  Paolo Bonzini  <bonzini@gnu.org>
+
+	* doc/porting.texi: Add missing macros.
+
+2004-07-12  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/ppc/funcs.h: Don't generate trampolines.
+	Separate prolog and epilog generation.
+	* lightning/ppc/core.h: Generate epilog explicitly.
+	Don't reserve r31 anymore.
+	* lightning/core-common.h: Remove call to jit_setup_code.
+
+2004-07-09  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/lightning.h.in: Avoid preprocessor warnings.
+	* lightning/lightning-inst.h: Likewise.
+
+	* lightning/i386/core.h: Define JIT_R, JIT_R_NUM, JIT_V,
+	JIT_V_NUM.
+	* lightning/ppc/core.h: Likewise.
+	* lightning/sparc/core.h: Likewise.
+	* lightning/i386/fp.h: Define JIT_FPR, JIT_FPR_NUM.
+	* lightning/ppc/fp.h: Likewise.
+	* lightning/sparc/fp.h: Likewise.
+	* lightning/core-common.h: Define fixed register names.
+	* lightning/fp-common.h: Likewise for FP regs.
+
+2004-07-09  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/ppc/funcs.h: Fix location where return address
+	is stored.
+	* lightning/i386/asm.h: Add a trailing _ to opcodes without
+	any parameter.
+	* lightning/i386/core.h: Adjust for the above.
+
+2004-04-15  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/i386/fp.h: Change "and" to "_and"
+	to satisfy C++ compilers.
+
+2004-04-14  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/sparc/fp.h: Use memcpy to implement jit_movi.
+	* lightning/ppc/fp.h: Use memcpy to implement jit_movi.
+	Move floating-point opcodes...
+	* lightning/ppc/asm.h: ... here.
+
+2004-04-14  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/core-common.h: Add jit_finishr.
+	* lightning/ppc/core.h: Add jit_callr and jit_finishr.
+	* lightning/i386/core.h: Add jit_callr.
+	* lightning/sparc/core.h: Add jit_callr.  Fix typo.
+
+2004-04-14  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/i386/core.h: Fix pasto in jit_b*_ui.
+
+2004-03-30  Laurent Michel
+
+	* lightning/ppc: Implement PowerPC floating point
+	(ChangeLog entry missing).
+
+2004-03-12  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/fp-common.h: Load/store macros are not the
+	same for floats and doubles anywhere, but jit_retval may be.
+	* lightning/i386/asm.h: Fix = mistaken for == in ESCrri.
+	* lightning/i386/core.h: Fix typo in jit_prepare_[fd].
+	* lightning/i386/fp.h: Rewritten.
+	* tests/testfp.c: Add tests for unordered comparisons.
+	* tests/testfp.ok: Add results.
+
+2004-03-15  Paolo Bonzini  <bonzini@gnu.org>
+
+	Merge changes from Laurent Michel.
+
+	* lightning/asm-common.h: Add _jit_I_noinc.
+	* lightning/core-common.h: Support jit_init,
+	jit_setup_code, jit_patch_at.  Return patchable IP from
+	jit_movi_p.
+	* lightning/funcs-common.h: Provide defaults
+	for jit_setup_code, jit_start_pfx, jit_end_pfx
+	* lightning/i386/core.h: Add jit_patch_at, jit_patch_movi.
+	* lightning/ppc/core.h: Likewise.
+	* lightning/sparc/core.h: Likewise.
+	* lightning/ppc/asm.h: Fix generation of branch destination
+	displacements in _FB and _BB
+	* lightning/ppc/core.h: Generate trampolines in the user
+	area.
+	* lightning/ppc/funcs.h: Add a few casts.
+	* tests/bc.c: New testcase.
+
+	* lightning/i386/asm.h: Wrap into #ifndef LIGHTNING_DEBUG.
+	* lightning/ppc/asm.h: Wrap into #ifndef LIGHTNING_DEBUG.
+	* lightning/sparc/asm.h: Wrap into #ifndef LIGHTNING_DEBUG.
+
+
+2004-03-09  Paolo Bonzini  <bonzini@gnu.org>
+
+	* lightning/sparc/fp.h: Rewrite.  Move macros for
+	FP code generation...
+	* lightning/sparc/asm.h: ... here.
+	* lightning/sparc/core.h: Rename jit_prepare to
+	jit_prepare_i, jit_retval to jit_retval_i.
+	* lightning/ppc/core.h: Rename jit_prepare to
+	jit_prepare_i, jit_retval to jit_retval_i.
+	* lightning/i386/core.h: Rename jit_prepare to
+	jit_prepare_i, jit_retval to jit_retval_i.
+	* lightning/core-common.h: Provide backwards
+	compatible synonyms for the above.
+	* lightning/fp-common.h: Rewrite.
+	* lightning-inst.h: Include fp unconditionally.
+	* lightning.h.in: Include fp unconditionally.
+	* tests/Makefile.am: Enable fp tests.
+	* tests/fib.c: Use jit_retval_i.
+	* tests/fibit.c: Cast codeBuffer to char *.
+	* tests/funcfp.c: Use new fp macros.
+	* tests/printf.c: Use jit_retval_i.
+	* tests/rpnfp.c: Use new fp macros.
+	* tests/testfp.c: Use new fp macros.
+
 2004-03-02  Paolo Bonzini  <bonzini@gnu.org>

 	* lightning/i386/core.h: generate correct code when
--- a/11
+++ b/11
@ -1,10 +1,17 @@
 NEWS FROM VERSION 1.1.2 TO 1.2

-o   Floating-point interface rewritten, uses a common register
-    file architecture rather than a stack.
+o   Floating-point interface rewritten, uses a register file
+    architecture rather than a stack.

 o   Many bug fixes.

+o   jit_prepare and jit_retval are now jit_prepare_i and
+    jit_retval_i.
+
+o   Support for Fedora Core 1's exec-shield feature.
+
+o   PPC supports both SysV and Darwin ABIs.
+
 o   More (and more complete) examples provided.

 ---
--- a/8
+++ b/8
@ -0,0 +1,8 @@
+Thanks to all the following people for their help in
+improving GNU lightning:
+
+Tom Tromey                      <tromey@redhat.com>
+Laurent Michel                  <ldm@thorgal.homelinux.org>
+Eli Barzilay                    <eli@barzilay.org>
+Jens Troeger                    <savage@light-speed.de>
+Basile Starynkevitch            <basile@starynkevitch.net>
--- a/config/config.guess
+++ b/config/config.guess
@ -0,0 +1 @@
+/sw/share/automake-1.9/config.guess
--- a/config/config.sub
+++ b/config/config.sub
@ -0,0 +1 @@
+/sw/share/automake-1.9/config.sub
--- a/config/depcomp
+++ b/config/depcomp
@ -0,0 +1 @@
+/sw/share/automake-1.9/depcomp
--- a/config/mdate-sh
+++ b/config/mdate-sh
@ -0,0 +1 @@
+/sw/share/automake-1.9/mdate-sh
--- a/config/missing
+++ b/config/missing
@ -0,0 +1 @@
+/sw/share/automake-1.9/missing
--- a/config/texi2dvi
+++ b/config/texi2dvi
@ -1,6 +1,6 @@
 #! /bin/sh
 # texi2dvi --- produce DVI (or PDF) files from Texinfo (or LaTeX) sources.
-# $Id: texi2dvi,v 1.14 2003/02/05 00:42:33 karl Exp $
+# $Id: texi2dvi,v 1.1.1.1 2004/03/03 12:51:44 bonzini Exp $
 #
 # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2001,
 # 2002, 2003 Free Software Foundation, Inc.
@ -27,7 +27,7 @@
 # the `--debug' option when making a bug report.

 # This string is expanded by rcs automatically when this file is checked out.
-rcs_revision='$Revision: 1.14 $'
+rcs_revision='$Revision: 1.1.1.1 $'
 rcs_version=`set - $rcs_revision; echo $2`
 program=`echo $0 | sed -e 's!.*/!!'`
 version="texi2dvi (GNU Texinfo 4.5) $rcs_version
--- a/config/texinfo.tex
+++ b/config/texinfo.tex
@ -0,0 +1 @@
+/sw/share/automake-1.9/texinfo.tex
--- a/doc/.cvsignore
+++ b/doc/.cvsignore
@ -0,0 +1,3 @@
+*.info*
+stamp-*
+version.texi
--- a/doc/Makefile.am
+++ b/doc/Makefile.am
@ -1,5 +1,3 @@
-EXTRA_DIST=lightning.info lightning.info-1 lightning.info-2 lightning.info-3
-
 TEXI2DVI=$(top_srcdir)/config/texi2dvi
 HELP2MAN = $(top_srcdir)/config/help2man

--- a/doc/body.texi
+++ b/doc/body.texi
@ -51,7 +51,7 @@ There are no Secondary Sections, no Cover Texts and no Invariant Sections
 Info documentation, constitutes the Title Page.
@end titlepage

-@ifclear ISTEX
+@ifnottex
@node Top
@top @lightning{}

@ -61,17 +61,17 @@ which are usually either inefficient or non-portable, @lightning{} is
 both retargetable and very fast.

@include toc.texi
-@end ifclear
+@end ifnottex

@node Overview
@chapter Introduction to @lightning{}

-@ifset ISTEX
+@iftex
 This document describes @value{TOPIC} the @lightning{} library for
 dynamic code generation.  Unlike other dynamic code generation systems,
 which are usually either inefficient or non-portable, @lightning{} is
 both retargetable and very fast.
-@end ifset
+@end iftex

@ifclear USING
 This manual assumes that you are pretty comfortable with the usage of
--- a/doc/lightning.texi
+++ b/doc/lightning.texi
@ -36,11 +36,6 @@
@c Macros for Texinfo 3.1/4.0 compatibility
@c ---------------------------------------------------------------------

-@c Emulate the `@ifnottex' command which is found in Texinfo 4.0
-@iftex
-@set ISTEX
-@end iftex
-
@c @hlink (macro), @url and @email are used instead of @uref for Texinfo 3.1
@c compatibility
@macro hlink{url, link}
--- a/doc/porting.texi
+++ b/doc/porting.texi
@ -353,16 +353,20 @@ that make up the platform-independent interface provided by
 Implementation of forward references takes place in:

@itemize @bullet
-@bulletize The branch macros
-@bulletize The @code{jit_patch} macros
+@item
+The branch macros
+
+@item
+The @code{jit_patch_at} macros
@end itemize

 Roughly speaking, the branch macros, as seen in @usingref{GNU lightning
 macros, Generating code at run-time}, return a value that later calls
-to @code{jit_patch} use to complete the assembly of the forward
-reference.  This value is usually the contents of the program counter
-after the branch instruction is compiled (which is accessible in the
-@code{_jit.pc} variable).  Let's see an example from the x86 back-end:
+to @code{jit_patch} or @code{jit_patch_at} use to complete the assembly
+of the forward reference.  This value is usually the contents of the
+program counter after the branch instruction is compiled (which is
+accessible in the @code{_jit.pc} variable).  Let's see an example from
+the x86 back-end:

@example
 #define jit_bmsr_i(label, s1, s2)                            \
@ -374,7 +378,7 @@ the combination of a @code{TEST} instruction (bit-wise @sc{and} between
 the two operands) and a @code{JNZ} instruction (jump if non-zero).  The
 macro then returns the final value of the program counter.

-@code{jit_patch} is one of the few macros that need to possess a
+@code{jit_patch_at} is one of the few macros that need to possess a
 knowledge of the machine's instruction formats.  Its purpose is to
 patch a branch instruction (identified by the value returned at the
 moment the branch was compiled) to jump to the current position (that
@ -382,11 +386,11 @@ is, to the address identified by @code{_jit.pc}).

 On the x86, the displacement between the jump and the landing point is
 expressed as a 32-bit signed integer lying in the last four bytes of the
-jump instruction.  The definition of @code{_jit_patch} is:
+jump instruction.  The definition of @code{jit_patch_at} is:

@example
-#define jit_patch(jump_pc)      (*_PSL((jump_pc) - 4) = \
-				  _jit.pc - (jump_pc))
+#define jit_patch_at(jump_pc, pv) (*_PSL((jump_pc) - 4) = \
+				   (pv) - (jump_pc))
@end example

 The @code{_PSL} macro is nothing more than a cast to @code{long *},
@ -394,42 +398,69 @@ and is used here to shorten the definition and avoid cluttering it with
 excessive parentheses.  These type-cast macros are:

@itemize @bullet
-@bulletize @code{_PUC(X)} to cast to a @code{unsigned char *}.
-@bulletize @code{_PUS(X)} to cast to a @code{unsigned short *}.
-@bulletize @code{_PUI(X)} to cast to a @code{unsigned int *}.
-@bulletize @code{_PSL(X)} to cast to a @code{long *}.
-@bulletize @code{_PUL(X)} to cast to a @code{unsigned long *}.
+@item
+@code{_PUC(X)} to cast to a @code{unsigned char *}.
+
+@item
+@code{_PUS(X)} to cast to a @code{unsigned short *}.
+
+@item
+@code{_PUI(X)} to cast to a @code{unsigned int *}.
+
+@item
+@code{_PSL(X)} to cast to a @code{long *}.
+
+@item
+@code{_PUL(X)} to cast to a @code{unsigned long *}.
@end itemize

 On other platforms, notably RISC ones, the displacement is embedded into
-the instruction itself.  In this case, @code{jit_patch} must first zero
+the instruction itself.  In this case, @code{jit_patch_at} must first zero
 out the field, and then @sc{or} in the correct displacement.  The SPARC,
 for example, encodes the displacement in the bottom 22 bits; in addition
 the right-most two bits are suppressed, which are always zero because
 instructions have to be word-aligned.

@example
-#define jit_patch(delay_pc)   jit_patch_ ( ((delay_pc) - 1) )
+#define jit_patch_at(delay_pc, pv)   jit_patch_ (((delay_pc) - 1), (pv))

@rem{/* branch instructions return the address of the @emph{delay}
 * instruction---this is just a helper macro that makes the code more
 * readable.
 */}
-#define jit_patch_(jump_pc)   (*jump_pc =		    \
+#define jit_patch_(jump_pc, pv)   (*jump_pc =		    \
 	 (*jump_pc & ~_MASK(22)) |			    \
-         ((_UL(_jit.pc) - _UL(jump_pc)) >> 2) & _MASK(22))
+         ((_UL(pv) - _UL(jump_pc)) >> 2) & _MASK(22))
@end example

 This introduces more predefined shortcut macros:
@itemize @bullet
-@bulletize @code{_UC(X)} to cast to a @code{unsigned char}.
-@bulletize @code{_US(X)} to cast to a @code{unsigned short}.
-@bulletize @code{_UI(X)} to cast to a @code{unsigned int}.
-@bulletize @code{_SL(X)} to cast to a @code{long}.
-@bulletize @code{_UL(X)} to cast to a @code{unsigned long}.
-@bulletize @code{_MASK(N)} gives a binary number made of N ones.
+@item
+@code{_UC(X)} to cast to a @code{unsigned char}.
+
+@item
+@code{_US(X)} to cast to a @code{unsigned short}.
+
+@item
+@code{_UI(X)} to cast to a @code{unsigned int}.
+
+@item
+@code{_SL(X)} to cast to a @code{long}.
+
+@item
+@code{_UL(X)} to cast to a @code{unsigned long}.
+
+@item
+@code{_MASK(N)} gives a binary number made of N ones.
@end itemize

+Dual to branches and @code{jit_patch_at} are @code{jit_movi_p}
+and @code{jit_patch_movi}, since they can also be used to implement
+forward references.  @code{jit_movi_p} should be carefully implemented
+to use an encoding that is as long as possible, and it should return
+an address which is then passed to @code{jit_patch_movi}.  The
+implementation of @code{jit_patch_movi} is similar to
+@code{jit_patch_at}.

@node Common features
@section Common features supported by @file{core-common.h}
@ -448,14 +479,16 @@ avoids compiler warnings about redefined macros, but there should be
 no need to define them.  They are:
@example
 #define jit_extr_c_ui(d, rs)
-#define jit_extr_i_ul(d, rs)
 #define jit_extr_s_ui(d, rs)
+#define jit_extr_c_ul(d, rs)
+#define jit_extr_s_ul(d, rs)
+#define jit_extr_i_ul(d, rs)
 #define jit_negr_i(d, rs)
 #define jit_negr_l(d, rs)
@end example

@item Support for the @sc{abi}
-Both @code{jit_prolog}, @code{jit_leaf} and @code{jit_finish} are not
+None of @code{jit_prolog}, @code{jit_leaf} and @code{jit_finish} is
 mandatory.  If not defined, they will be defined respectively as an
 empty macro, as a synonym for @code{jit_prolog}, and as a synonym for
@code{jit_calli}.  Whether to define them or not in the port-specific
@ -471,8 +504,12 @@ and ``reverse subtraction'' (that is, REG2@math{=}IMM@math{-}REG1):
@example
 #define jit_extr_c_i(d, rs)
 #define jit_extr_s_i(d, rs)
+#define jit_extr_c_l(d, rs)
+#define jit_extr_s_l(d, rs)
+#define jit_extr_i_l(d, rs)
 #define jit_rsbi_i(d, rs, is)
 #define jit_rsbi_l(d, rs, is)
+#define jit_rsbi_p(d, rs, is)
@end example

@item Conversion between network and host byte ordering
@ -510,7 +547,7 @@ unsigned integers is exactly the same as adding two signed integers
@lightning{} provides both @code{jit_addr_i} and @code{jit_addr_ui}
 macros.  Similarly, pointers and unsigned long integers behave in the
 same way, but @lightning{} has separate instructions for the two data
-types---those that operate on pointers usually comprise a typecast
+types---those that operate on pointers usually include a typecast
 that makes programs clearer.

@item Shortcuts
@ -553,7 +590,7 @@ instruction to be scheduled in the delay slot with the branch
 instruction.  The only parameter accepted by the macro is a call
 to a branch macro, which must be expanded @strong{exactly once} by
@code{jit_fill_delay_after}.  The client must be able to pass the
-return value of @code{jit_fill_delay_after} to @code{jit_patch}.
+return value of @code{jit_fill_delay_after} to @code{jit_patch_at}.

 There are two possible approaches that can be used in
@code{jit_fill_delay_after}.  They are summarized in the following
@ -701,7 +738,7 @@ in @file{core-common.h} (@pxref{Common features, , Common features
 supported by @file{core-common.h}}).

@example
-#define jit_prepare(numargs)  (_jitl.pusharg = _Ro(numargs))
+#define jit_prepare_i(numargs)  (_jitl.pusharg = _Ro(numargs))
 #define jit_pusharg_i(rs)       (--_jitl.pusharg,         \
                                 MOVrr((rs), _jitl.pusharg))
@end example
@ -759,18 +796,18 @@ epilog code.
@code{jit_pusharg} uses a hardware push operation, which is commonly
 available on CISC machines (where this approach is most likely
 followed).  Since the stack has to be cleaned up after the call,
-@code{jit_prepare} remembers how many parameters have been put there,
+@code{jit_prepare_i} remembers how many parameters have been put there,
 and @code{jit_finish} adjusts the stack pointer after the call.

@example
-#define jit_prepare(numargs) (_jitl.args += (numargs))
+#define jit_prepare_i(numargs) (_jitl.args += (numargs))
 #define jit_pusharg_i(rs)      PUSHLr(rs)
 #define jit_finish(sub)        (jit_calli((sub)),              \
                               ADDLir(4 * _jitl.args, JIT_SP), \
                               _jitl.args = 0)
@end example

-Note the usage of @code{+=} in @code{jit_prepare}.  This is done
+Note the usage of @code{+=} in @code{jit_prepare_i}.  This is done
 so that one can defer the popping of the arguments that were saved
 on the stack (@dfn{stack pollution}).  To do so, it is sufficient to
 use @code{jit_calli} instead of @code{jit_finish} in all but the
@ -823,12 +860,12 @@ operations:
@table @b
@item Register names (all mandatory but the last two)
@example
-#define JIT_R0
-#define JIT_R1
-#define JIT_R2
-#define JIT_V0
-#define JIT_V1
-#define JIT_V2
+#define JIT_R
+#define JIT_R_NUM
+#define JIT_V
+#define JIT_V_NUM
+#define JIT_FPR
+#define JIT_FPR_NUM
 #define JIT_SP
 #define JIT_FP
 #define JIT_RZERO
@ -850,57 +887,81 @@ operations:
 #define jit_arg_ui()
 #define jit_arg_ul()
 #define jit_arg_us()
+#define jit_abs_d(rd,rs)
 #define jit_addi_i(d, rs, is)
+#define jit_addr_d(rd,s1,s2)
 #define jit_addr_i(d, s1, s2)
 #define jit_addxi_i(d, rs, is)
 #define jit_addxr_i(d, s1, s2)
 #define jit_andi_i(d, rs, is)
 #define jit_andr_i(d, s1, s2)
 #define jit_beqi_i(label, rs, is)
+#define jit_beqr_d(label, s1, s2)
 #define jit_beqr_i(label, s1, s2)
 #define jit_bgei_i(label, rs, is)
 #define jit_bgei_ui(label, rs, is)
+#define jit_bger_d(label, s1, s2)
 #define jit_bger_i(label, s1, s2)
 #define jit_bger_ui(label, s1, s2)
 #define jit_bgti_i(label, rs, is)
 #define jit_bgti_ui(label, rs, is)
+#define jit_bgtr_d(label, s1, s2)
 #define jit_bgtr_i(label, s1, s2)
 #define jit_bgtr_ui(label, s1, s2)
 #define jit_blei_i(label, rs, is)
 #define jit_blei_ui(label, rs, is)
+#define jit_bler_d(label, s1, s2)
 #define jit_bler_i(label, s1, s2)
 #define jit_bler_ui(label, s1, s2)
+#define jit_bltgtr_d(label, s1, s2)
 #define jit_blti_i(label, rs, is)
 #define jit_blti_ui(label, rs, is)
+#define jit_bltr_d(label, s1, s2)
 #define jit_bltr_i(label, s1, s2)
 #define jit_bltr_ui(label, s1, s2)
-#define jit_boaddi_i(label, rs, is)
-#define jit_boaddi_ui(label, rs, is)
-#define jit_boaddr_i(label, s1, s2)
-#define jit_boaddr_ui(label, s1, s2)
-#define jit_bosubi_i(label, rs, is)
-#define jit_bosubi_ui(label, rs, is)
-#define jit_bosubr_i(label, s1, s2)
-#define jit_bosubr_ui(label, s1, s2)
 #define jit_bmci_i(label, rs, is)
 #define jit_bmcr_i(label, s1, s2)
 #define jit_bmsi_i(label, rs, is)
 #define jit_bmsr_i(label, s1, s2)
 #define jit_bnei_i(label, rs, is)
+#define jit_bner_d(label, s1, s2)
 #define jit_bner_i(label, s1, s2)
+#define jit_boaddi_i(label, rs, is)
+#define jit_boaddi_ui(label, rs, is)
+#define jit_boaddr_i(label, s1, s2)
+#define jit_boaddr_ui(label, s1, s2)
+#define jit_bordr_d(label, s1, s2)
+#define jit_bosubi_i(label, rs, is)
+#define jit_bosubi_ui(label, rs, is)
+#define jit_bosubr_i(label, s1, s2)
+#define jit_bosubr_ui(label, s1, s2)
+#define jit_buneqr_d(label, s1, s2)
+#define jit_bunger_d(label, s1, s2)
+#define jit_bungtr_d(label, s1, s2)
+#define jit_bunler_d(label, s1, s2)
+#define jit_bunltr_d(label, s1, s2)
+#define jit_bunordr_d(label, s1, s2)
 #define jit_calli(label)
+#define jit_callr(label)
+#define jit_ceilr_d_i(rd, rs)
 #define jit_divi_i(d, rs, is)
 #define jit_divi_ui(d, rs, is)
+#define jit_divr_d(rd,s1,s2)
 #define jit_divr_i(d, s1, s2)
 #define jit_divr_ui(d, s1, s2)
 #define jit_eqi_i(d, rs, is)
+#define jit_eqr_d(d, s1, s2)
 #define jit_eqr_i(d, s1, s2)
+#define jit_extr_i_d(rd, rs)
+#define jit_floorr_d_i(rd, rs)
 #define jit_gei_i(d, rs, is)
 #define jit_gei_ui(d, s1, s2)
+#define jit_ger_d(d, s1, s2)
 #define jit_ger_i(d, s1, s2)
 #define jit_ger_ui(d, s1, s2)
 #define jit_gti_i(d, rs, is)
 #define jit_gti_ui(d, s1, s2)
+#define jit_gtr_d(d, s1, s2)
 #define jit_gtr_i(d, s1, s2)
 #define jit_gtr_ui(d, s1, s2)
 #define jit_hmuli_i(d, rs, is)
@ -909,61 +970,93 @@ operations:
 #define jit_hmulr_ui(d, s1, s2)
 #define jit_jmpi(label)
 #define jit_jmpr(reg)
+#define jit_ldxi_f(rd, rs, is)
+#define jit_ldxr_f(rd, s1, s2)
 #define jit_ldxi_c(d, rs, is)
+#define jit_ldxi_d(rd, rs, is)
 #define jit_ldxi_i(d, rs, is)
 #define jit_ldxi_s(d, rs, is)
 #define jit_ldxi_uc(d, rs, is)
 #define jit_ldxi_us(d, rs, is)
 #define jit_ldxr_c(d, s1, s2)
+#define jit_ldxr_d(rd, s1, s2)
 #define jit_ldxr_i(d, s1, s2)
 #define jit_ldxr_s(d, s1, s2)
 #define jit_ldxr_uc(d, s1, s2)
 #define jit_ldxr_us(d, s1, s2)
 #define jit_lei_i(d, rs, is)
 #define jit_lei_ui(d, s1, s2)
+#define jit_ler_d(d, s1, s2)
 #define jit_ler_i(d, s1, s2)
 #define jit_ler_ui(d, s1, s2)
 #define jit_lshi_i(d, rs, is)
 #define jit_lshr_i(d, r1, r2)
+#define jit_ltgtr_d(d, s1, s2)
 #define jit_lti_i(d, rs, is)
 #define jit_lti_ui(d, s1, s2)
+#define jit_ltr_d(d, s1, s2)
 #define jit_ltr_i(d, s1, s2)
 #define jit_ltr_ui(d, s1, s2)
 #define jit_modi_i(d, rs, is)
 #define jit_modi_ui(d, rs, is)
 #define jit_modr_i(d, s1, s2)
 #define jit_modr_ui(d, s1, s2)
+#define jit_movi_d(rd,immd)
+#define jit_movi_f(rd,immf)
 #define jit_movi_i(d, is)
+#define jit_movi_p(d, is)
+#define jit_movr_d(rd,rs)
 #define jit_movr_i(d, rs)
 #define jit_muli_i(d, rs, is)
 #define jit_muli_ui(d, rs, is)
+#define jit_mulr_d(rd,s1,s2)
 #define jit_mulr_i(d, s1, s2)
 #define jit_mulr_ui(d, s1, s2)
+#define jit_negr_d(rd,rs)
 #define jit_nei_i(d, rs, is)
+#define jit_ner_d(d, s1, s2)
 #define jit_ner_i(d, s1, s2)
 #define jit_nop()
+#define jit_ordr_d(d, s1, s2)
 #define jit_ori_i(d, rs, is)
 #define jit_orr_i(d, s1, s2)
-#define jit_patch(jump_pc)
+#define jit_patch_at(jump_pc, value)
+#define jit_patch_movi(jump_pc, value)
 #define jit_pop_i(rs)
-#define jit_prepare(numargs)
+#define jit_prepare_d(numargs)
+#define jit_prepare_f(numargs)
+#define jit_prepare_i(numargs)
 #define jit_push_i(rs)
 #define jit_pusharg_i(rs)
 #define jit_ret()
 #define jit_retval_i(rd)
+#define jit_roundr_d_i(rd, rs)
 #define jit_rshi_i(d, rs, is)
 #define jit_rshi_ui(d, rs, is)
 #define jit_rshr_i(d, r1, r2)
 #define jit_rshr_ui(d, r1, r2)
+#define jit_sqrt_d(rd,rs)
 #define jit_stxi_c(rd, id, rs)
+#define jit_stxi_d(id, rd, rs)
+#define jit_stxi_f(id, rd, rs)
 #define jit_stxi_i(rd, id, rs)
 #define jit_stxi_s(rd, id, rs)
 #define jit_stxr_c(d1, d2, rs)
+#define jit_stxr_d(d1, d2, rs)
+#define jit_stxr_f(d1, d2, rs)
 #define jit_stxr_i(d1, d2, rs)
 #define jit_stxr_s(d1, d2, rs)
+#define jit_subr_d(rd,s1,s2)
 #define jit_subr_i(d, s1, s2)
 #define jit_subxi_i(d, rs, is)
 #define jit_subxr_i(d, s1, s2)
+#define jit_truncr_d_i(rd, rs)
+#define jit_uneqr_d(d, s1, s2)
+#define jit_unger_d(d, s1, s2)
+#define jit_ungtr_d(d, s1, s2)
+#define jit_unler_d(d, s1, s2)
+#define jit_unltr_d(d, s1, s2)
+#define jit_unordr_d(d, s1, s2)
 #define jit_xori_i(d, rs, is)
 #define jit_xorr_i(d, s1, s2)
@end example
@ -971,17 +1064,20 @@ operations:
@item Non mandatory---there should be no need to define them:
@example
 #define jit_extr_c_ui(d, rs)
-#define jit_extr_i_ul(d, rs)
 #define jit_extr_s_ui(d, rs)
+#define jit_extr_c_ul(d, rs)
+#define jit_extr_s_ul(d, rs)
+#define jit_extr_i_ul(d, rs)
 #define jit_negr_i(d, rs)
 #define jit_negr_l(d, rs)
@end example

@item Non mandatory---whether to define them depends on the @sc{abi}:
@example
-#define jit_prolog()
-#define jit_finish()
-#define jit_leaf()
+#define jit_prolog(n)
+#define jit_finish(sub)
+#define jit_finishr(reg)
+#define jit_leaf(n)
 #define jit_getarg_c(reg, ofs)
 #define jit_getarg_i(reg, ofs)
 #define jit_getarg_l(reg, ofs)
@ -991,12 +1087,17 @@ operations:
 #define jit_getarg_ui(reg, ofs)
 #define jit_getarg_ul(reg, ofs)
 #define jit_getarg_us(reg, ofs)
+#define jit_getarg_f(reg, ofs)
+#define jit_getarg_d(reg, ofs)
@end example

@item Non mandatory---define them if instructions that do this exist:
@example
 #define jit_extr_c_i(d, rs)
 #define jit_extr_s_i(d, rs)
+#define jit_extr_c_l(d, rs)
+#define jit_extr_s_l(d, rs)
+#define jit_extr_i_l(d, rs)
 #define jit_rsbi_i(d, rs, is)
 #define jit_rsbi_l(d, rs, is)
@end example
@ -1037,6 +1138,14 @@ operations:
 #define jit_str_c(rd, rs)
 #define jit_str_i(rd, rs)
 #define jit_str_s(rd, rs)
+#define jit_ldi_f(rd, is)
+#define jit_sti_f(id, rs)
+#define jit_ldi_d(rd, is)
+#define jit_sti_d(id, rs)
+#define jit_ldr_f(rd, rs)
+#define jit_str_f(rd, rs)
+#define jit_ldr_d(rd, rs)
+#define jit_str_d(rd, rs)
@end example

@item Synonyms---don't define them:
@ -1085,14 +1194,20 @@ operations:
 #define jit_eqr_p(d, s1, s2)
 #define jit_eqr_ui(d, s1, s2)
 #define jit_eqr_ul(d, s1, s2)
+#define jit_extr_c_s(d, rs)
+#define jit_extr_c_us(d, rs)
+#define jit_extr_uc_s(d, rs)
+#define jit_extr_uc_us(d, rs)
 #define jit_extr_uc_i(d, rs)
 #define jit_extr_uc_ui(d, rs)
-#define jit_extr_ui_l(d, rs)
-#define jit_extr_ui_l(d, rs)
-#define jit_extr_ui_ul(d, rs)
-#define jit_extr_ui_ul(d, rs)
 #define jit_extr_us_i(d, rs)
 #define jit_extr_us_ui(d, rs)
+#define jit_extr_uc_l(d, rs)
+#define jit_extr_uc_ul(d, rs)
+#define jit_extr_us_l(d, rs)
+#define jit_extr_us_ul(d, rs)
+#define jit_extr_ui_l(d, rs)
+#define jit_extr_ui_ul(d, rs)
 #define jit_gei_p(d, rs, is)
 #define jit_ger_p(d, s1, s2)
 #define jit_gti_p(d, rs, is)
@ -1145,8 +1260,10 @@ operations:
 #define jit_retval_ui(rd)
 #define jit_retval_ul(rd)
 #define jit_retval_us(rd)
+#define jit_rsbi_p(d, rs, is)
 #define jit_rsbi_ui(d, rs, is)
 #define jit_rsbi_ul(d, rs, is)
+#define jit_rsbr_p(d, s1, s2)
 #define jit_rsbr_ui(d, s1, s2)
 #define jit_rsbr_ul(d, s1, s2)
 #define jit_sti_p(d, is)
@ -1175,6 +1292,12 @@ operations:
 #define jit_subr_p(d, s1, s2)
 #define jit_subr_ui(d, s1, s2)
 #define jit_subr_ul(d, s1, s2)
+#define jit_subxi_p(d, rs, is)
+#define jit_subxi_ui(d, rs, is)
+#define jit_subxi_ul(d, rs, is)
+#define jit_subxr_p(d, s1, s2)
+#define jit_subxr_ui(d, s1, s2)
+#define jit_subxr_ul(d, s1, s2)
 #define jit_xori_ui(d, rs, is)
 #define jit_xori_ul(d, rs, is)
 #define jit_xorr_ui(d, s1, s2)
@ -1183,6 +1306,19 @@ operations:

@item Shortcuts---don't define them:
@example
+#define JIT_R0
+#define JIT_R1
+#define JIT_R2
+#define JIT_V0
+#define JIT_V1
+#define JIT_V2
+#define JIT_FPR0
+#define JIT_FPR1
+#define JIT_FPR2
+#define JIT_FPR3
+#define JIT_FPR4
+#define JIT_FPR5
+#define jit_patch(jump_pc)
 #define jit_notr_c(d, rs)
 #define jit_notr_i(d, rs)
 #define jit_notr_l(d, rs)
@ -1191,12 +1327,61 @@ operations:
 #define jit_notr_ui(d, rs)
 #define jit_notr_ul(d, rs)
 #define jit_notr_us(d, rs)
+#define jit_rsbr_d(d, s1, s2)
 #define jit_rsbr_i(d, s1, s2)
 #define jit_rsbr_l(d, s1, s2)
 #define jit_subi_i(d, rs, is)
 #define jit_subi_l(d, rs, is)
@end example

+@item Mandatory unless target arithmetic is always done in the same precision:
+@example
+#define jit_abs_f(rd,rs)
+#define jit_addr_f(rd,s1,s2)
+#define jit_beqr_f(label, s1, s2)
+#define jit_bger_f(label, s1, s2)
+#define jit_bgtr_f(label, s1, s2)
+#define jit_bler_f(label, s1, s2)
+#define jit_bltgtr_f(label, s1, s2)
+#define jit_bltr_f(label, s1, s2)
+#define jit_bner_f(label, s1, s2)
+#define jit_bordr_f(label, s1, s2)
+#define jit_buneqr_f(label, s1, s2)
+#define jit_bunger_f(label, s1, s2)
+#define jit_bungtr_f(label, s1, s2)
+#define jit_bunler_f(label, s1, s2)
+#define jit_bunltr_f(label, s1, s2)
+#define jit_bunordr_f(label, s1, s2)
+#define jit_ceilr_f_i(rd, rs)
+#define jit_divr_f(rd,s1,s2)
+#define jit_eqr_f(d, s1, s2)
+#define jit_extr_d_f(rs, rd)
+#define jit_extr_f_d(rs, rd)
+#define jit_extr_i_f(rd, rs)
+#define jit_floorr_f_i(rd, rs)
+#define jit_ger_f(d, s1, s2)
+#define jit_gtr_f(d, s1, s2)
+#define jit_ler_f(d, s1, s2)
+#define jit_ltgtr_f(d, s1, s2)
+#define jit_ltr_f(d, s1, s2)
+#define jit_movr_f(rd,rs)
+#define jit_mulr_f(rd,s1,s2)
+#define jit_negr_f(rd,rs)
+#define jit_ner_f(d, s1, s2)
+#define jit_ordr_f(d, s1, s2)
+#define jit_roundr_f_i(rd, rs)
+#define jit_rsbr_f(d, s1, s2)
+#define jit_sqrt_f(rd,rs)
+#define jit_subr_f(rd,s1,s2)
+#define jit_truncr_f_i(rd, rs)
+#define jit_uneqr_f(d, s1, s2)
+#define jit_unger_f(d, s1, s2)
+#define jit_ungtr_f(d, s1, s2)
+#define jit_unler_f(d, s1, s2)
+#define jit_unltr_f(d, s1, s2)
+#define jit_unordr_f(d, s1, s2)
+@end example
+
@item Mandatory if sizeof(long) != sizeof(int)---don't define them on other systems:
@example
 #define jit_addi_l(d, rs, is)
@ -1241,6 +1426,12 @@ operations:
 #define jit_divr_ul(d, s1, s2)
 #define jit_eqi_l(d, rs, is)
 #define jit_eqr_l(d, s1, s2)
+#define jit_extr_c_l(d, rs)
+#define jit_extr_c_ul(d, rs)
+#define jit_extr_s_l(d, rs)
+#define jit_extr_s_ul(d, rs)
+#define jit_extr_i_l(d, rs)
+#define jit_extr_i_ul(d, rs)
 #define jit_gei_l(d, rs, is)
 #define jit_gei_ul(d, rs, is)
 #define jit_ger_l(d, s1, s2)
--- a/doc/toc.texi
+++ b/doc/toc.texi
@ -7,7 +7,6 @@
 * Installation::          Configuring and installing GNU lightning
 * The instruction set::   The RISC instruction set used i GNU lightning
 * GNU lightning macros::  GNU lightning's macros
-* Floating-point::        Doing floating point computations.
 * Reentrancy::            Re-entrant usage of GNU lightning
 * Autoconf support::      Using @code{autoconf} with GNU lightning
@end ifset
--- a/doc/using.texi
+++ b/doc/using.texi
@ -49,9 +49,14 @@ that closely match those of most existing RISC architectures, or
 that can be easily syntesized if absent.  Each instruction is composed
 of:
@itemize @bullet
-@bulletize an operation (like @code{sub} or @code{mul})
-@bulletize sometimes, an register/immediate flag (@code{r} or @code{i})
-@bulletize a type identifier (occasionally, two)
+@item
+an operation, like @code{sub} or @code{mul}
+
+@item
+sometimes, a register/immediate flag (@code{r} or @code{i})
+
+@item
+a type identifier or, occasionally, two
@end itemize

 The second and third field are separated by an underscore; thus,
@ -75,6 +80,8 @@ following table together with the C types they represent:
     ui         @r{unsigned int}
     l          @r{long}
     ul         @r{unsigned long}
+     f          @r{float}
+     d          @r{double}
     p          @r{void *}
@end example

@ -82,27 +89,31 @@ Some of these types may not be distinct: for example, (e.g., @code{l}
 is equivalent to @code{i} on 32-bit machines, and @code{p} is
 substantially equivalent to @code{ul}).

-There are seven registers, of which six are general-purpose, while
-the last is used to contain the stack pointer (@code{SP}).  The
-stack pointer can be used to allocate and access local variables
-on the stack (which is supposed to grow downwards in memory on all 
-architectures).
+There are at least seven integer registers, of which six are
+general-purpose, while the last is used to contain the stack pointer
+(@code{SP}).  The stack pointer can be used to allocate and access local
+variables on the stack (which is supposed to grow downwards in memory
+on all architectures).

-Of the six general-purpose registers, three are guaranteed to be
+Of the general-purpose registers, at least three are guaranteed to be
 preserved across function calls (@code{V0}, @code{V1} and
-@code{V2}) and three are not (@code{R0}, @code{R1} and
-@code{R2}).@footnote{Six registers are not very much, but this
+@code{V2}) and at least three are not (@code{R0}, @code{R1} and
+@code{R2}).  Six registers are not very much, but this
 restriction was forced by the need to target CISC architectures
-which, like the x86, are poor of registers.  Anyway, consider
-that even on a RISC architecture you don't have many more registers
-which are not devoted to function calls: on the SPARC, you have nine
-(@code{%g1} and the eight registers @code{%l0} through @code{%l7}).}
+which, like the x86, are short of registers; anyway, backends can
+specify the actual number of available caller- and callee-save
+registers.

 In addition, there is a special @code{RET} register which contains
 the return value.  You should always remember, however, that writing
 this register could overwrite either a general-purpose register or
 an incoming parameter, depending on the architecture.

+There are at least six floating-point registers, named @code{FPR0} to
+@code{FPR5}.  These are separate from the integer registers on
+all the supported architectures; on Intel architectures, the
+register stack is mapped to a flat register file.
+
 The complete instruction set follows; as you can see, most non-memory
 operations only take integers, long integers (either signed or
 unsigned) and pointers as operands; this was done in order to reduce
@ -113,61 +124,117 @@ signed and in an unsigned way.

@table @b
@item Binary ALU operations
-These accept three operands, of which the last can be an immediate
-value.  @code{addx} operations must directly follow @code{addc}, and
+These accept three operands; the last one can be an immediate
+value for integer operands, or a register for all operand types.
+@code{addx} operations must directly follow @code{addc}, and
@code{subx} must follow @code{subc}; otherwise, results are undefined.
@example
-addr/addi    i  ui l  ul p  O1 = O2 + O3
-addxr/addxi  i  ui l  ul    O1 = O2 + (O3 + carry)
-addcr/addci  i  ui l  ul    O1 = O2 + O3, set carry
-subr/subi    i  ui l  ul p  O1 = O2 - O3
-subxr/subxi  i  ui l  ul    O1 = O2 - (O3 + carry)
-subcr/subci  i  ui l  ul    O1 = O2 - O3, set carry
-rsbr/rsbi    i  ui l  ul p  O1 = O3 - O2
-mulr/muli    i  ui l  ul    O1 = O2 * O3
-hmulr/hmuli  i  ui l  ul    O1 = @r{high bits of} O2 * O3
-divr/divi    i  ui l  ul    O1 = O2 / O3
-modr/modi    i  ui l  ul    O1 = O2 % O3
-andr/andi    i  ui l  ul    O1 = O2 & O3
-orr/ori      i  ui l  ul    O1 = O2 | O3
-xorr/xori    i  ui l  ul    O1 = O2 ^ O3
-lshr/lshi    i  ui l  ul    O1 = O2 << O3
-rshr/rshi    i  ui l  ul    O1 = O2 >> O3@footnote{The sign bit is propagated for signed types.}
+addr     i  ui  l  ul  p  f  d  O1 = O2 + O3
+addi     i  ui  l  ul  p        O1 = O2 + O3
+addxr    i  ui  l  ul           O1 = O2 + (O3 + carry)
+addxi    i  ui  l  ul           O1 = O2 + (O3 + carry)
+addcr    i  ui  l  ul           O1 = O2 + O3, set carry
+addci    i  ui  l  ul           O1 = O2 + O3, set carry
+subr     i  ui  l  ul  p  f  d  O1 = O2 - O3
+subi     i  ui  l  ul  p        O1 = O2 - O3
+subxr    i  ui  l  ul           O1 = O2 - (O3 + carry)
+subxi    i  ui  l  ul           O1 = O2 - (O3 + carry)
+subcr    i  ui  l  ul           O1 = O2 - O3, set carry
+subci    i  ui  l  ul           O1 = O2 - O3, set carry
+rsbr     i  ui  l  ul  p  f  d  O1 = O3 - O2
+rsbi     i  ui  l  ul  p        O1 = O3 - O2
+mulr     i  ui  l  ul     f  d  O1 = O2 * O3
+muli     i  ui  l  ul           O1 = O2 * O3
+hmulr    i  ui  l  ul           O1 = @r{high bits of} O2 * O3
+hmuli    i  ui  l  ul           O1 = @r{high bits of} O2 * O3
+divr     i  ui  l  ul     f  d  O1 = O2 / O3
+divi     i  ui  l  ul           O1 = O2 / O3
+modr     i  ui  l  ul           O1 = O2 % O3
+modi     i  ui  l  ul           O1 = O2 % O3
+andr     i  ui  l  ul           O1 = O2 & O3
+andi     i  ui  l  ul           O1 = O2 & O3
+orr      i  ui  l  ul           O1 = O2 | O3
+ori      i  ui  l  ul           O1 = O2 | O3
+xorr     i  ui  l  ul           O1 = O2 ^ O3
+xori     i  ui  l  ul           O1 = O2 ^ O3
+lshr     i  ui  l  ul           O1 = O2 << O3
+lshi     i  ui  l  ul           O1 = O2 << O3
+rshr     i  ui  l  ul           O1 = O2 >> O3@footnote{The sign bit is propagated for signed types.}
+rshi     i  ui  l  ul           O1 = O2 >> O3@footnote{The sign bit is propagated for signed types.}
@end example

@item Unary ALU operations
 These accept two operands, both of which must be registers.
@example
-negr        i     l         O1 = -O2
+negr     i     l         f  d  O1 = -O2
 notr     i  ui l  ul           O1 = ~O2
@end example

@item Compare instructions
-These accept three operands, of which the last can be an immediate
-value.  The last two operands are compared, and the first operand is
-set to either 0 or 1, according to whether the given condition was
-met or not.
+These accept three operands; again, the last can be an immediate
+value for integer data types.  The last two operands are compared,
+and the first operand is set to either 0 or 1, according to
+whether the given condition was met or not.
+
+The conditions given below are for the standard behavior of C,
+where the ``unordered'' comparison result is mapped to false.

@example
-ltr/lti     i ui l  ul p     O1 = (O2 <  O3)
-ler/lei     i ui l  ul p     O1 = (O2 <= O3)
-gtr/gti     i ui l  ul p     O1 = (O2 >  O3)
-ger/gei     i ui l  ul p     O1 = (O2 >= O3)
-eqr/eqi     i ui l  ul p     O1 = (O2 == O3)
-ner/nei     i ui l  ul p     O1 = (O2 != O3)
+ltr      i  ui  l  ul  p  f  d  O1 = (O2 <  O3)
+lti      i  ui  l  ul  p        O1 = (O2 <  O3)
+ler      i  ui  l  ul  p  f  d  O1 = (O2 <= O3)
+lei      i  ui  l  ul  p        O1 = (O2 <= O3)
+gtr      i  ui  l  ul  p  f  d  O1 = (O2 >  O3)
+gti      i  ui  l  ul  p        O1 = (O2 >  O3)
+ger      i  ui  l  ul  p  f  d  O1 = (O2 >= O3)
+gei      i  ui  l  ul  p        O1 = (O2 >= O3)
+eqr      i  ui  l  ul  p  f  d  O1 = (O2 == O3)
+eqi      i  ui  l  ul  p        O1 = (O2 == O3)
+ner      i  ui  l  ul  p  f  d  O1 = (O2 != O3)
+nei      i  ui  l  ul  p        O1 = (O2 != O3)
+unltr                     f  d  O1 = !(O2 >= O3)
+unler                     f  d  O1 = !(O2 >  O3)
+ungtr                     f  d  O1 = !(O2 <= O3)
+unger                     f  d  O1 = !(O2 <  O3)
+uneqr                     f  d  O1 = !(O2 <  O3) && !(O2 >  O3)
+ltgtr                     f  d  O1 = !(O2 >= O3) || !(O2 <= O3)
+ordr                      f  d  O1 =  (O2 == O2) &&  (O3 == O3)
+unordr                    f  d  O1 =  (O2 != O2) ||  (O3 != O3)
@end example

@item Transfer operations
 These accept two operands; for @code{ext} both of them must be
 registers, while @code{mov} accepts an immediate value as the second
-operand. @code{ext} needs @strong{two} data type specifications, of
-which the first must be smaller in size than the second; for example
-@code{extr_c_ui} is correct while @code{extr_ul_us} is not.
+operand.
+
+Unlike @code{movr} and @code{movi}, the other instructions are applied
+between operands of different data types, and they need @strong{two}
+data type specifications.  You can use @code{extr} to convert between
+integer data types, in which case the first must be smaller in size
+than the second; for example @code{extr_c_ui} is correct while
+@code{extr_ul_us} is not.  You can also use @code{extr} to convert
+an integer to a floating point value: the only available possibilities
+are @code{extr_i_f} and @code{extr_i_d}.  The other instructions
+convert a floating point value to an integer, so the possible
+suffixes are @code{_f_i} and @code{_d_i}.
+
@example
-movr/movi               i  ui l  ul p   O1 = O2
-extr        c  uc s  us i  ui l  ul     O1 = O2@footnote{Unlike @code{movr} and @code{movi}, @code{extr} is applied between operands of different sizes.}
+movr                      i  ui  l  ul  p  f  d  O1 = O2
+movi                      i  ui  l  ul  p  f  d  O1 = O2
+extr        c  uc  s  us  i  ui  l  ul     f  d  O1 = O2
+roundr                    i                f  d  O1 = round(O2)
+truncr                    i                f  d  O1 = trunc(O2)
+floorr                    i                f  d  O1 = floor(O2)
+ceilr                     i                f  d  O1 = ceil(O2)
@end example

+Note that the order of the arguments is @emph{destination first,
+source second} as for all other @lightning{} instructions, but
+the order of the types is always reversed with respect to that
+of the arguments: @emph{shorter}---source---@emph{first,
+longer}---destination---@emph{second}.  This happens for historical
+reasons.
+
@item Network extensions
 These accept two operands, both of which must be registers; these
 two instructions actually perform the same task, yet they are
@ -185,8 +252,10 @@ in both cases, the last can be either a register or an immediate
 value. Values are extended (with or without sign, according to
 the data type specification) to fit a whole register.
@example
-ldr/ldi     c  uc s  us i  ui l  ul p   O1 = *O2
-ldxr/ldxi   c  uc s  us i  ui l  ul p   O1 = *(O2+O3)
+ldr     c  uc  s  us  i  ui  l  ul  p  f  d  O1 = *O2
+ldi     c  uc  s  us  i  ui  l  ul  p  f  d  O1 = *O2
+ldxr    c  uc  s  us  i  ui  l  ul  p  f  d  O1 = *(O2+O3)
+ldxi    c  uc  s  us  i  ui  l  ul  p  f  d  O1 = *(O2+O3)
@end example

@item Store operations
@ -194,8 +263,10 @@ ldxr/ldxi   c  uc s  us i  ui l  ul p   O1 = *(O2+O3)
 both cases, the first can be either a register or an immediate
 value. Values are sign-extended to fit a whole register.
@example
-str/sti     c  uc s  us i  ui l  ul p   *O1 = O2
-stxr/stxi   c  uc s  us i  ui l  ul p   *(O1+O2) = O3
+str     c  uc  s  us  i  ui  l  ul  p  f  d  *O1 = O2
+sti     c  uc  s  us  i  ui  l  ul  p  f  d  *O1 = O2
+stxr    c  uc  s  us  i  ui  l  ul  p  f  d  *(O1+O2) = O3
+stxi    c  uc  s  us  i  ui  l  ul  p  f  d  *(O1+O2) = O3
@end example

@item Stack management
@ -210,19 +281,20 @@ popr                    i  ui l  ul p   @r{pop }O1@r{ off the stack}
@item Argument management
 These are:
@example
-prepare     (not specified)
-pusharg     c  uc s  us i  ui l  ul p
-getarg      c  uc s  us i  ui l  ul p
-arg         c  uc s  us i  ui l  ul p
+prepare                   i                f  d
+pusharg     c  uc  s  us  i  ui  l  ul  p  f  d
+getarg      c  uc  s  us  i  ui  l  ul  p  f  d
+arg         c  uc  s  us  i  ui  l  ul  p  f  d
@end example

 Of these, the first two are used by the caller, while the last two
 are used by the callee.  A code snippet that wants to call another
 procedure and has to pass registers must, in order: use the
@code{prepare} instruction, giving the number of arguments to
-be passed to the procedure; use @code{pusharg} to push the arguments
-@strong{in reverse order}; and use @code{calli} or @code{finish}
-(explained below) to perform the actual call.
+be passed to the procedure (once for each data type); use
+@code{pusharg} to push the arguments @strong{in reverse order};
+and use @code{calli} or @code{finish} (explained below) to
+perform the actual call.

@code{arg} and @code{getarg} are used by the callee.
@code{arg} is different from other instruction in that it does not
@ -269,18 +341,36 @@ is to be used to compile forward branches as explained in
 destination of the branch and two operands to be compared; of these,
 the last can be either a register or an immediate.  They are:
@example
-bltr/blti     i ui l  ul p    @r{if }O2 <  O3@r{ goto }O1
-bler/blei     i ui l  ul p    @r{if }O2 <= O3@r{ goto }O1
-bgtr/bgti     i ui l  ul p    @r{if }O2 >  O3@r{ goto }O1
-bger/bgei     i ui l  ul p    @r{if }O2 >= O3@r{ goto }O1
-beqr/beqi     i ui l  ul p    @r{if }O2 == O3@r{ goto }O1
-bner/bnei     i ui l  ul p    @r{if }O2 != O3@r{ goto }O1
+bltr      i  ui  l  ul  p  f  d  @r{if }(O2 <  O3)@r{ goto }O1
+blti      i  ui  l  ul  p        @r{if }(O2 <  O3)@r{ goto }O1
+bler      i  ui  l  ul  p  f  d  @r{if }(O2 <= O3)@r{ goto }O1
+blei      i  ui  l  ul  p        @r{if }(O2 <= O3)@r{ goto }O1
+bgtr      i  ui  l  ul  p  f  d  @r{if }(O2 >  O3)@r{ goto }O1
+bgti      i  ui  l  ul  p        @r{if }(O2 >  O3)@r{ goto }O1
+bger      i  ui  l  ul  p  f  d  @r{if }(O2 >= O3)@r{ goto }O1
+bgei      i  ui  l  ul  p        @r{if }(O2 >= O3)@r{ goto }O1
+beqr      i  ui  l  ul  p  f  d  @r{if }(O2 == O3)@r{ goto }O1
+beqi      i  ui  l  ul  p        @r{if }(O2 == O3)@r{ goto }O1
+bner      i  ui  l  ul  p  f  d  @r{if }(O2 != O3)@r{ goto }O1
+bnei      i  ui  l  ul  p        @r{if }(O2 != O3)@r{ goto }O1

-bmsr/bmsi     i ui l  ul      @r{if }O2 &  O3@r{ goto }O1
-bmcr/bmci     i ui l  ul      @r{if }!(O2 & O3)@r{ goto }O1@footnote{These two mnemonics mean, respectively, @dfn{branch if mask set} and @dfn{branch if mask cleared}.}
+bunltr                     f  d  @r{if }!(O2 >= O3)@r{ goto }O1
+bunler                     f  d  @r{if }!(O2 >  O3)@r{ goto }O1
+bungtr                     f  d  @r{if }!(O2 <= O3)@r{ goto }O1
+bunger                     f  d  @r{if }!(O2 <  O3)@r{ goto }O1
+buneqr                     f  d  @r{if }!(O2 <  O3) && !(O2 >  O3)@r{ goto }O1
+bltgtr                     f  d  @r{if }!(O2 >= O3) || !(O2 <= O3)@r{ goto }O1
+bordr                      f  d  @r{if } (O2 == O2) &&  (O3 == O3)@r{ goto }O1
+bunordr                    f  d  @r{if } (O2 != O2) ||  (O3 != O3)@r{ goto }O1

-boaddr/boaddi i ui l  ul      O2 += O3@r{, goto }O1@r{ on overflow}
-bosubr/bosubi i ui l  ul      O2 -= O3@r{, goto }O1@r{ on overflow}
+bmsr      i ui l  ul             @r{if }O2 &  O3@r{ goto }O1
+bmsi      i ui l  ul             @r{if }O2 &  O3@r{ goto }O1
+bmcr      i ui l  ul             @r{if }!(O2 & O3)@r{ goto }O1
+bmci      i ui l  ul             @r{if }!(O2 & O3)@r{ goto }O1@footnote{These mnemonics mean, respectively, @dfn{branch if mask set} and @dfn{branch if mask cleared}.}
+boaddr    i ui l  ul             O2 += O3@r{, goto }O1@r{ on overflow}
+boaddi    i ui l  ul             O2 += O3@r{, goto }O1@r{ on overflow}
+bosubr    i ui l  ul             O2 -= O3@r{, goto }O1@r{ on overflow}
+bosubi    i ui l  ul             O2 -= O3@r{, goto }O1@r{ on overflow}
@end example

@item Jump and return operations
@ -292,12 +382,14 @@ instruction.  Results are undefined when using function calls
 in a leaf function.
@example
 calli     (not specified)                  @r{function call to O1}
+callr     (not specified)                  @r{function call to a register}
 finish    (not specified)                  @r{function call to O1}
+finishr   (not specified)                  @r{function call to a register}
 jmpi/jmpr (not specified)                  @r{unconditional jump to O1}
 prolog    (not specified)                  @r{function prolog for O1 args}
 leaf      (not specified)                  @r{the same for leaf functions}
 ret       (not specified)                  @r{return from subroutine}
-retval    c  uc s  us i  ui l  ul p  @r{move return value}
+retval    c  uc s  us i  ui l  ul p  f  d  @r{move return value}
                                           @r{to register}
@end example

@ -353,7 +445,7 @@ between parentheses, just like with every other @sc{cpp} macro.

 This small tutorial presents three examples:

-@ifset ISTEX
+@iftex
@itemize @bullet
@item
 The @code{incr} function found in @ref{The instruction set, ,
@ -368,15 +460,15 @@ An RPN calculator.
@item
 Fibonacci numbers
@end itemize
-@end ifset
-@ifclear ISTEX
+@end iftex
+@ifnottex
@menu
 * incr::             A function which increments a number by one
 * printf::           A simple function call to printf
 * RPN calculator::   A more complex example, an RPN calculator
 * Fibonacci::        Calculating Fibonacci numbers
@end menu
-@end ifclear
+@end ifnottex

@node incr
@section A function which increments a number by one
@ -931,8 +1023,23 @@ instruction; otherwise, it emits the delay instruction before the branch
 instruction.  The delay instruction must not depend on being executed
 before or after the branch.

-@node Floating-point
-@chapter Doing floating point computations
+Instead of @code{jit_patch}, you can use @code{jit_patch_at}, which
+takes two arguments: the first is the same as for @code{jit_patch}, and
+the second is the value to be patched in.  In other words, these two
+invocations have the same effect:
+
+@example
+  jit_patch (jump_pc);
+  jit_patch_at (jump_pc, jit_get_ip ());
+@end example
+
+Dual to branches and @code{jit_patch_at} are @code{jit_movi_p}
+and @code{jit_patch_movi}, which can also be used to implement
+forward references.  @code{jit_movi_p} is carefully implemented
+to use an encoding that is as long as possible, so that it can
+always be patched; in addition, like branches, it will return
+an address which is then passed to @code{jit_patch_movi}.  The
+usage of @code{jit_patch_movi} is similar to @code{jit_patch_at}.

@node Reentrancy
@chapter Re-entrant usage of @lightning{}
@ -1040,6 +1147,22 @@ extern void _opt_muli_i(struct jit_state *, int, int, int);
@end example


+@section Registers
+@chapter Accessing the whole register file
+
+As mentioned earlier in this chapter, all @lightning{} back-ends
+are guaranteed to have at least six integer registers and six
+floating-point registers, but many back-ends will have more.
+
+To access the entire register files, you can use the
+@code{JIT_R}, @code{JIT_V} and @code{JIT_FPR} macros.  They
+accept a parameter that identifies the register number, which
+must be strictly less than @code{JIT_R_NUM}, @code{JIT_V_NUM}
+and @code{JIT_FPR_NUM} respectively; the number need not be
+constant.  Of course, expressions like @code{JIT_R0} and
+@code{JIT_R(0)} denote the same register, and likewise for
+integer callee-saved, or floating-point, registers.
+
@node Autoconf support
@chapter Using @code{autoconf} with @lightning{}

--- a/lightning-inst.h
+++ b/lightning-inst.h
@ -39,7 +39,6 @@ extern "C" {
 #endif

 #include <lightning/asm-common.h>
-#include <lightning/funcs-common.h>

 #ifndef LIGHTNING_DEBUG
 #include <lightning/asm.h>
@ -48,11 +47,9 @@ extern "C" {
 #include <lightning/core.h>
 #include <lightning/core-common.h>
 #include <lightning/funcs.h>
+#include <lightning/funcs-common.h>
 #include <lightning/fp.h>
-
-#ifdef jit_cmp
 #include <lightning/fp-common.h>
-#endif

 #ifndef JIT_R0
 #error GNU lightning does not support the current target
--- a/lightning.h.in
+++ b/lightning.h.in
@ -62,7 +62,6 @@ extern "C" {
 #endif

 #include <lightning/asm-common.h>
-#include <lightning/funcs-common.h>

 #ifndef LIGHTNING_DEBUG
 #include <lightning/asm.h>
@ -71,11 +70,9 @@ extern "C" {
 #include <lightning/core.h>
 #include <lightning/core-common.h>
 #include <lightning/funcs.h>
+#include <lightning/funcs-common.h>
 #include <lightning/fp.h>
-
-#ifdef jit_cmp
 #include <lightning/fp-common.h>
-#endif

 #ifdef LIGHTNING_DISASSEMBLE
 extern void disassemble(FILE *stream, char *from, char *to);
--- a/lightning/Makefile.am
+++ b/lightning/Makefile.am
@ -12,5 +12,5 @@ dist_pkgdata_DATA = Makefile.am
 nobase_dist_lightning_HEADERS = $(LIGHTNING_FILES)
 nodist_lightning_HEADERS = asm.h core.h funcs.h fp.h 
 else
-dist_noinst_HEADERS = $(LIGHTNING_FILES) lightning.h
+dist_noinst_HEADERS = $(LIGHTNING_FILES)
 endif
--- a/lightning/asm-common.h
+++ b/lightning/asm-common.h
@ -93,7 +93,6 @@ typedef unsigned long	_ul;
 #define _jit_UI(X)	((_ui  )(X))
 #define _jit_SL(X)	((_sl  )(X))
 #define _jit_UL(X)	((_ul  )(X))
-
 # define _PUC(X)	((_uc *)(X))
 # define _PUS(X)	((_us *)(X))
 # define _PUI(X)	((_ui *)(X))
@ -104,6 +103,7 @@ typedef unsigned long	_ul;
 #define _jit_W(W)         _jit_UL(((*_jit.x.us_pc++)= _jit_US((W)&0xffff)))
 #define _jit_I(I)         _jit_UL(((*_jit.x.ui_pc++)= _jit_UI((I)       )))
 #define _jit_L(L)         _jit_UL(((*_jit.x.ul_pc++)= _jit_UL((L)       )))
+#define _jit_I_noinc(I)   _jit_UL(((*_jit.x.ui_pc)=   _jit_UI((I)       )))

 #define _MASK(N)	((unsigned)((1<<(N)))-1)
 #define _siP(N,I)	(!((((unsigned)(I))^(((unsigned)(I))<<1))&~_MASK(N)))
--- a/lightning/core-common.h
+++ b/lightning/core-common.h
@ -45,14 +45,24 @@ typedef struct {
  struct jit_local_state jitl;
 } jit_state;

+#ifdef jit_init
+static jit_state 			_jit = jit_init ();
+#else
 static jit_state 			_jit;
+#endif

 #define JIT_NOREG			(-1)
+#define JIT_R0				JIT_R(0)
+#define JIT_R1				JIT_R(1)
+#define JIT_R2				JIT_R(2)
+#define JIT_V0				JIT_V(0)
+#define JIT_V1				JIT_V(1)
+#define JIT_V2				JIT_V(2)

 #define _jitl				_jit.jitl

 #define	jit_get_ip()			(*(jit_code *) &_jit.x.pc)
-#define	jit_set_ip(ptr)			(_jit.x.pc = (jit_insn *) ptr, jit_get_ip())
+#define	jit_set_ip(ptr)			(_jit.x.pc = (ptr), jit_get_ip ())
 #define	jit_get_label()			(_jit.x.pc)
 #define	jit_forward()			(_jit.x.pc)

@ -138,16 +148,24 @@ typedef union jit_code {
 #define jit_subci_ul(d, rs, is)		jit_subci_l((d), (rs), (is))	
 #define jit_subcr_ul(d, s1, s2)		jit_subcr_l((d), (s1), (s2))
 #define jit_subxi_ui(d, rs, is)		jit_subxi_i((d), (rs), (is))	
+#define jit_subxi_ul(d, rs, is)		jit_subxi_l((d), (rs), (is))	
 #define jit_subxr_ui(d, s1, s2)		jit_subxr_i((d), (s1), (s2))
+#define jit_subxr_ul(d, s1, s2)		jit_subxr_l((d), (s1), (s2))	/* long-sized, like jit_subxi_ul */
 #define jit_xori_ul(d, rs, is)		jit_xori_l((d), (rs), (is))	
 #define jit_xorr_ul(d, s1, s2)		jit_xorr_l((d), (s1), (s2))

 #define jit_addr_p(d, s1, s2)		jit_addr_ul((d), (s1), 	      (s2))
 #define jit_addi_p(d, rs, is)		jit_addi_ul((d), (rs), (long) (is))
 #define jit_movr_p(d, rs)		jit_movr_ul((d),              (rs))
-#define jit_movi_p(d, is)		jit_movi_ul((d),       (long) (is))
 #define jit_subr_p(d, s1, s2)		jit_subr_ul((d), (s1),        (s2))
 #define jit_subi_p(d, rs, is)		jit_subi_ul((d), (rs), (long) (is))
+#define jit_rsbi_p(d, rs, is)		jit_rsbi_ul((d), (rs), (long) (is))
+
+#ifndef jit_movi_p
+#define jit_movi_p(d, is)		(jit_movi_ul((d),       (long) (is)), _jit.x.pc)
+#endif
+
+#define jit_patch(pv)        		jit_patch_at ((pv), (_jit.x.pc))

 #ifndef jit_addci_i
 #define jit_addci_i(d, rs, is)		jit_addi_i((d), (rs), (is))	
@ -190,8 +208,11 @@ typedef union jit_code {
 #define jit_subi_l(d, rs, is)		jit_addi_l((d), (rs), -(is))
 #define jit_subci_i(d, rs, is)		jit_addci_i((d), (rs), -(is))
 #define jit_subci_l(d, rs, is)		jit_addci_l((d), (rs), -(is))
+#define jit_rsbr_f(d, s1, s2)		jit_subr_f((d), (s2), (s1))
+#define jit_rsbr_d(d, s1, s2)		jit_subr_d((d), (s2), (s1))
 #define jit_rsbr_i(d, s1, s2)		jit_subr_i((d), (s2), (s1))
 #define jit_rsbr_l(d, s1, s2)		jit_subr_l((d), (s2), (s1))
+#define jit_rsbr_p(d, s1, s2)		jit_subr_p((d), (s2), (s1))

 /* Unary */
 #define jit_notr_c(d, rs)		jit_xori_c((d), (rs), 255)
@ -216,23 +237,43 @@ typedef union jit_code {
 #define jit_extr_s_i(d, rs)		(jit_lshi_i((d), (rs), 16), jit_rshi_i((d), (d), 16))
 #endif

+#ifdef jit_addi_l /* sizeof(long) != sizeof(int) */
+#ifndef jit_extr_c_l
+#define jit_extr_c_l(d, rs)		(jit_lshi_l((d), (rs), 56), jit_rshi_l((d), (d), 56))
+#endif
+#ifndef jit_extr_s_l
+#define jit_extr_s_l(d, rs)		(jit_lshi_l((d), (rs), 48), jit_rshi_l((d), (d), 48))
+#endif
+#ifndef jit_extr_i_l
+#define jit_extr_i_l(d, rs)		(jit_lshi_l((d), (rs), 32), jit_rshi_l((d), (d), 32))
+#endif
+#ifndef jit_extr_c_ul
+#define jit_extr_c_ul(d, rs)		jit_andi_l((d), (rs), 0xFF)
+#endif
+#ifndef jit_extr_s_ul
+#define jit_extr_s_ul(d, rs)		jit_andi_l((d), (rs), 0xFFFF)
+#endif
+#ifndef jit_extr_i_ul
+#define jit_extr_i_ul(d, rs)		jit_andi_l((d), (rs), 0xFFFFFFFFUL)
+#endif
+#endif

+#define jit_extr_c_s(d, rs)		jit_extr_c_i((d), (rs))
+#define jit_extr_c_us(d, rs)		jit_extr_c_ui((d), (rs))
+#define jit_extr_uc_s(d, rs)		jit_extr_uc_i((d), (rs))
+#define jit_extr_uc_us(d, rs)		jit_extr_uc_ui((d), (rs))
 #define jit_extr_uc_i(d, rs)		jit_extr_c_ui((d), (rs))
 #define jit_extr_uc_ui(d, rs)		jit_extr_c_ui((d), (rs))
 #define jit_extr_us_i(d, rs)		jit_extr_s_ui((d), (rs))
 #define jit_extr_us_ui(d, rs)		jit_extr_s_ui((d), (rs))
-
-#ifndef jit_extr_i_ul
-#ifdef jit_addi_l /* sizeof(long) != sizeof(int) */
-#define jit_extr_i_ul(d, rs)		jit_andi_ui((d), (rs), 0xFF)
-#else /* sizeof(long) == sizeof(int) */
-#define jit_extr_i_ul(d, rs)		jit_movr_i(d, rs)
-#endif /* sizeof(long) == sizeof(int) */
-#endif
-
+#define jit_extr_uc_l(d, rs)		jit_extr_c_ul((d), (rs))
+#define jit_extr_uc_ul(d, rs)		jit_extr_c_ul((d), (rs))
+#define jit_extr_us_l(d, rs)		jit_extr_s_ul((d), (rs))
+#define jit_extr_us_ul(d, rs)		jit_extr_s_ul((d), (rs))
 #define jit_extr_ui_l(d, rs)		jit_extr_i_ul((d), (rs))
 #define jit_extr_ui_ul(d, rs)		jit_extr_i_ul((d), (rs))

+
 /* NTOH/HTON is not mandatory for big endian architectures */
 #ifndef jit_ntoh_ui /* big endian */
 #define jit_ntoh_ui(d, rs)		((d) == (rs) ? (void)0 : jit_movr_i((d), (rs)))
@ -251,7 +292,7 @@ typedef union jit_code {
 #define jit_pushr_p(rs)			jit_pushr_ul(rs)
 #define jit_popr_p(rs)			jit_popr_ul(rs)		

-#define jit_prepare(nint)		jitfp_prepare((nint), 0, 0)
+#define jit_prepare(nint)		jit_prepare_i((nint))
 #define jit_pusharg_c(rs)		jit_pusharg_i(rs)
 #define jit_pusharg_s(rs)		jit_pusharg_i(rs)
 #define jit_pusharg_uc(rs)		jit_pusharg_i(rs)
@ -388,10 +429,17 @@ typedef union jit_code {
 #define jit_retval_c(rd)		jit_retval_i((rd))
 #define jit_retval_s(rd)		jit_retval_i((rd))

+/* This was a bug, but we keep it.  */
+#define jit_retval(rd)			jit_retval_i ((rd))
+
 #ifndef jit_finish
 #define jit_finish(sub)			jit_calli(sub)
 #endif

+#ifndef jit_finishr
+#define jit_finishr(reg)		jit_callr(reg)
+#endif
+
 #ifndef jit_prolog
 #define jit_prolog(numargs)
 #endif
@ -474,6 +522,14 @@ typedef union jit_code {
 #define jit_rshi_ul(d, rs, is)		jit_rshi_ui((d), (rs), (is))	
 #define jit_rshr_ul(d, s1, s2)		jit_rshr_ui((d), (s1), (s2))

+/* Sign/Zero extension */
+#define jit_extr_c_l(d, rs)		jit_extr_c_i(d, rs)
+#define jit_extr_c_ul(d, rs)		jit_extr_c_ui(d, rs)
+#define jit_extr_s_l(d, rs)		jit_extr_s_i(d, rs)
+#define jit_extr_s_ul(d, rs)		jit_extr_s_ui(d, rs)
+#define jit_extr_i_l(d, rs)		jit_movr_i(d, rs)
+#define jit_extr_i_ul(d, rs)		jit_movr_i(d, rs)
+
 /* Unary */
 #define jit_movi_l(d, rs)		jit_movi_i((d), (rs))
 #define jit_movr_l(d, rs)		jit_movr_i((d), (rs))
--- a/lightning/fp-common.h
+++ b/lightning/fp-common.h
@ -29,232 +29,58 @@
 *
 ***********************************************************************/

-struct jit_fp {
-  char	kind;
-  char  subkind;
-  union {
-    struct {
-      int   displ;
-      char  reg1;
-      char  reg2;
-    } addr;
-    union {
-      double number;
-      long   split[sizeof(double) / sizeof(long)];
-    } imm;
-    struct {
-      struct jit_fp *lhs, *rhs;
-    } ops;
-  } d;
-};
-
-#ifdef jit_trunc
-
-enum {	JIT_NULL,						/* unused */
-        
-        JIT_CMP, JIT_FLOOR, JIT_CEIL, JIT_ROUND, JIT_TRUNC,	/* integer */
-	
-	JIT_XI, JIT_ADD, JIT_XR, JIT_SUB,			/* subkinds */
-	JIT_I,  JIT_MUL, JIT_R,  JIT_DIV,
-	JIT_INT,
-	
-	JIT_ABS, JIT_SIN, JIT_COS, JIT_TAN, JIT_ATN,		/* functions */
-	JIT_EXP, JIT_LOG, JIT_NEG, JIT_SQRT,
-	
-	JIT_OP,  JIT_FN,  JIT_LD,  JIT_IMM };			/* kinds */
-
-/* Declarations */
-
-static void _jit_emit(jit_state *, struct jit_fp *,
-		      int, int, int, int) JIT_UNUSED;
-static struct jit_fp *_jit_op(struct jit_fp *, int, 
-			      struct jit_fp *, struct jit_fp *) JIT_UNUSED;
-static struct jit_fp *_jit_ld(struct jit_fp *, int, 
-			      int, int) JIT_UNUSED;
-static struct jit_fp *_jit_fn(struct jit_fp *, int, 
-			      struct jit_fp *) JIT_UNUSED;
-static struct jit_fp *_jit_imm(struct jit_fp *, double) JIT_UNUSED;
-
-/* Internal function to walk the tree */
-
-void
-_jit_emit(jit_state *jit, struct jit_fp *head,
-	  int store_kind, int store1, int store2, int reg0)
-{
-#define _jit (*jit)
-  switch (head->kind) {
-    case JIT_OP:
-      _jit_emit(jit, head->d.ops.lhs, JIT_NULL, 0, 0, reg0);
-      _jit_emit(jit, head->d.ops.rhs, JIT_NULL, 0, 0, reg0 + 1);
-      switch (head->subkind) {
-        case JIT_ADD: jit_add_two(reg0); break;
-        case JIT_SUB: jit_sub_two(reg0); break;
-        case JIT_MUL: jit_mul_two(reg0); break;
-        case JIT_DIV: jit_div_two(reg0); break;
-      }
-      break;
-
-    case JIT_IMM:
-#ifdef JIT_LONG_IS_INT
-      jit_fpimm(reg0, head->d.imm.split[0], head->d.imm.split[1]);
-#else
-      jit_fpimm(reg0, head->d.imm.split[0]);
-#endif
-      break;
-
-    case JIT_FN:
-      _jit_emit(jit, head->d.ops.lhs, JIT_NULL, 0, 0, reg0);
-      switch (head->subkind) {
-	case JIT_ABS: jit_abs(reg0); break;
-	case JIT_NEG: jit_neg(reg0); break;
-#ifdef JIT_TRANSCENDENTAL
-	case JIT_SIN: jit_sin(reg0); break;
-	case JIT_SQRT: jit_sqrt(reg0); break;
-	case JIT_COS: jit_cos(reg0); break;
-	case JIT_TAN: jit_tan(reg0); break;
-	case JIT_ATN: jit_atn(reg0); break;
-	case JIT_EXP: jit_exp(reg0); break;
-	case JIT_LOG: jit_log(reg0); break;
-#endif
-      }
-      break;
-
-    case JIT_LD:
-      switch (head->subkind) {
-        case JIT_INT:    jit_exti_d(reg0, head->d.addr.reg1); break;
-        case JIT_XI:     jit_ldxi_f(reg0, head->d.addr.reg1, head->d.addr.displ); break;
-        case JIT_XR:     jit_ldxr_f(reg0, head->d.addr.reg1, head->d.addr.reg2);  break;
-        case JIT_XI | 1: jit_ldxi_d(reg0, head->d.addr.reg1, head->d.addr.displ); break;
-        case JIT_XR | 1: jit_ldxr_d(reg0, head->d.addr.reg1, head->d.addr.reg2);  break;
-#ifndef JIT_RZERO
-        case JIT_I:      jit_ldi_f(reg0, head->d.addr.displ); break;
-        case JIT_R:      jit_ldr_f(reg0, head->d.addr.reg1);  break;
-        case JIT_I | 1:  jit_ldi_d(reg0, head->d.addr.displ); break;
-        case JIT_R | 1:  jit_ldr_d(reg0, head->d.addr.reg1);  break;
-#endif
-      }
-      break;
-  }
-
-  switch (store_kind) {
-    case JIT_FLOOR:  jit_floor(store1, reg0);		 break;
-    case JIT_CEIL:   jit_ceil(store1, reg0);		 break;
-    case JIT_TRUNC:  jit_trunc(store1, reg0);		 break;
-    case JIT_ROUND:  jit_round(store1, reg0);		 break;
-    case JIT_CMP:    jit_cmp(store1, store2, reg0);	 break;
-    case JIT_XI:     jit_stxi_f(store2, store1, reg0);   break;
-    case JIT_XR:     jit_stxr_f(store2, store1, reg0);   break;
-    case JIT_XI | 1: jit_stxi_d(store2, store1, reg0);   break;
-    case JIT_XR | 1: jit_stxr_d(store2, store1, reg0);   break;
-#ifndef JIT_RZERO
-    case JIT_I:      jit_sti_f(store2, reg0);		 break;
-    case JIT_R:      jit_str_f(store2, reg0);		 break;
-    case JIT_I | 1:  jit_sti_d(store2, reg0);		 break;
-    case JIT_R | 1:  jit_str_d(store2, reg0);		 break;
-#endif
-    case JIT_NULL:   break;
-  }
-#undef _jit
-}
-
-/* Internal functions to build the tree */
-
-struct jit_fp *
-_jit_op(struct jit_fp *where, int which, 
-	struct jit_fp *op1, struct jit_fp *op2)
-{
-  where->kind = JIT_OP;
-  where->subkind = which;
-  where->d.ops.lhs = op1;
-  where->d.ops.rhs = op2;
-  return (where);
-}
-
-struct jit_fp *
-_jit_ld(struct jit_fp *where, int which, int op1, int op2)
-{
-  where->kind = JIT_LD;
-  where->subkind = which;
-  switch (which & ~1) {
-    case JIT_XI: where->d.addr.reg1 = op1;
-    case JIT_I:  where->d.addr.displ = op2;	break;
-    case JIT_XR: where->d.addr.reg2  = op2;
-    case JIT_INT:
-    case JIT_R:  where->d.addr.reg1  = op1;	break;
-  }
-  return (where);
-}
-
-struct jit_fp *
-_jit_fn(struct jit_fp *where, int which, struct jit_fp *op1)
-{
-  where->kind = JIT_FN;
-  where->subkind = which;
-  where->d.ops.lhs = op1;
-  return (where);
-}
-
-struct jit_fp *
-_jit_imm(struct jit_fp *where, double number)
-{
-  where->kind = JIT_IMM;
-  where->d.imm.number = number;
-  return (where);
-}
-
-#define jitfp_begin(buf)		(_jit.fp = (buf), --_jit.fp)
-#define jitfp_add(op1, op2)		_jit_op(++_jit.fp, JIT_ADD, (op1), (op2))
-#define jitfp_sub(op1, op2)		_jit_op(++_jit.fp, JIT_SUB, (op1), (op2))
-#define jitfp_mul(op1, op2)		_jit_op(++_jit.fp, JIT_MUL, (op1), (op2))
-#define jitfp_div(op1, op2)		_jit_op(++_jit.fp, JIT_DIV, (op1), (op2))
-#define jitfp_imm(imm)			_jit_imm(++_jit.fp, (imm))
-#define jitfp_exti_d(reg1)		_jit_ld(++_jit.fp, JIT_INT, (reg1), 0)
-#define jitfp_ldxi_f(reg1, imm)		_jit_ld(++_jit.fp, JIT_XI, (reg1), (long)(imm))
-#define jitfp_ldxr_f(reg1, reg2)	_jit_ld(++_jit.fp, JIT_XR, (reg1), (reg2))
-#define jitfp_ldxi_d(reg1, imm)		_jit_ld(++_jit.fp, JIT_XI | 1, (reg1), (long)(imm))
-#define jitfp_ldxr_d(reg1, reg2)	_jit_ld(++_jit.fp, JIT_XR | 1, (reg1), (reg2))
-#define jitfp_abs(op1)	 		_jit_fn(++_jit.fp, JIT_ABS, (op1))
-#define jitfp_sqrt(op1)	 		_jit_fn(++_jit.fp, JIT_SQRT, (op1))
-#define jitfp_neg(op1)			_jit_fn(++_jit.fp, JIT_NEG, (op1))
-#define jitfp_stxi_f(imm, reg1, op1)	_jit_emit(&_jit, (op1), JIT_XI, (reg1), (long)(imm), 0)
-#define jitfp_stxr_f(reg1, reg2, op1)	_jit_emit(&_jit, (op1), JIT_XR, (reg1), (reg2), 0)
-#define jitfp_stxi_d(imm, reg1, op1)	_jit_emit(&_jit, (op1), JIT_XI | 1, (reg1), (long)(imm), 0)
-#define jitfp_stxr_d(reg1, reg2, op1)	_jit_emit(&_jit, (op1), JIT_XR | 1, (reg1), (reg2), 0)
-#define jitfp_cmp(regle, regge, op1)	_jit_emit(&_jit, (op1), JIT_CMP, regle, regge, 0)
-#define jitfp_floor(reg1, op1)		_jit_emit(&_jit, (op1), JIT_FLOOR, reg1, 0, 0)
-#define jitfp_ceil(reg1, op1)		_jit_emit(&_jit, (op1), JIT_CEIL, reg1, 0, 0)
-#define jitfp_trunc(reg1, op1)		_jit_emit(&_jit, (op1), JIT_TRUNC, reg1, 0, 0)
-#define jitfp_round(reg1, op1)		_jit_emit(&_jit, (op1), JIT_ROUND, reg1, 0, 0)
-
-
-#ifdef JIT_TRANSCENDENTAL
-#define jitfp_sin(op1)			_jit_fn(++_jit.fp, JIT_SIN, (op1))
-#define jitfp_cos(op1)			_jit_fn(++_jit.fp, JIT_COS, (op1))
-#define jitfp_tan(op1)			_jit_fn(++_jit.fp, JIT_TAN, (op1))
-#define jitfp_atn(op1)			_jit_fn(++_jit.fp, JIT_ATN, (op1))
-#define jitfp_exp(op1)			_jit_fn(++_jit.fp, JIT_EXP, (op1))
-#define jitfp_log(op1)			_jit_fn(++_jit.fp, JIT_LOG, (op1))
-#endif
+#define JIT_FPR0			JIT_FPR(0)
+#define JIT_FPR1			JIT_FPR(1)
+#define JIT_FPR2			JIT_FPR(2)
+#define JIT_FPR3			JIT_FPR(3)
+#define JIT_FPR4			JIT_FPR(4)
+#define JIT_FPR5			JIT_FPR(5)

 #ifdef JIT_RZERO
-#define jitfp_ldi_f(imm)		_jit_ld(++_jit.fp, JIT_XI, JIT_RZERO, (long)(imm))
-#define jitfp_ldr_f(reg1)		_jit_ld(++_jit.fp, JIT_XR, JIT_RZERO, (reg1))
-#define jitfp_ldi_d(imm)		_jit_ld(++_jit.fp, JIT_XI | 1, JIT_RZERO, (long)(imm))
-#define jitfp_ldr_d(reg1)		_jit_ld(++_jit.fp, JIT_XR | 1, JIT_RZERO, (reg1))
-#define jitfp_sti_f(imm, op1)		_jit_emit(&_jit, (op1), JIT_XI, JIT_RZERO, (long)(imm), 0)
-#define jitfp_str_f(reg1, op1)		_jit_emit(&_jit, (op1), JIT_XR, JIT_RZERO, (reg1), 0)
-#define jitfp_sti_d(imm, op1)		_jit_emit(&_jit, (op1), JIT_XI | 1, JIT_RZERO, (long)(imm), 0)
-#define jitfp_str_d(reg1, op1)		_jit_emit(&_jit, (op1), JIT_XR | 1, JIT_RZERO, (reg1), 0)
-#else
-#define jitfp_ldi_f(imm)		_jit_ld(++_jit.fp, JIT_I, 0, (long)(imm))
-#define jitfp_ldr_f(reg1)		_jit_ld(++_jit.fp, JIT_R, (reg1), 0)
-#define jitfp_ldi_d(imm)		_jit_ld(++_jit.fp, JIT_I | 1, 0, (long)(imm))
-#define jitfp_ldr_d(reg1)		_jit_ld(++_jit.fp, JIT_R | 1, (reg1), 0)
-#define jitfp_sti_f(imm, op1)		_jit_emit(&_jit, (op1), JIT_I, 0, (long)(imm), 0)
-#define jitfp_str_f(reg1, op1)		_jit_emit(&_jit, (op1), JIT_R, 0, (reg1), 0)
-#define jitfp_sti_d(imm, op1)		_jit_emit(&_jit, (op1), JIT_I | 1, 0, (long)(imm), 0)
-#define jitfp_str_d(reg1, op1)		_jit_emit(&_jit, (op1), JIT_R | 1, 0, (reg1), 0)
+#ifndef jit_ldi_f
+#define jit_ldi_f(rd, is)               jit_ldxi_f((rd), JIT_RZERO, (is))
+#define jit_sti_f(id, rs)               jit_stxi_f((id), JIT_RZERO, (rs))
+#define jit_ldi_d(rd, is)               jit_ldxi_d((rd), JIT_RZERO, (is))
+#define jit_sti_d(id, rs)               jit_stxi_d((id), JIT_RZERO, (rs))
 #endif

-
+#ifndef jit_ldr_f
+#define jit_ldr_f(rd, rs)               jit_ldxr_f((rd), JIT_RZERO, (rs))
+#define jit_str_f(rd, rs)               jit_stxr_f((rd), JIT_RZERO, (rs))
+#define jit_ldr_d(rd, rs)               jit_ldxr_d((rd), JIT_RZERO, (rs))
+#define jit_str_d(rd, rs)               jit_stxr_d((rd), JIT_RZERO, (rs))
+#endif
+#endif
+
+#ifndef jit_addr_f
+#define jit_addr_f(rd,s1,s2)		jit_addr_d(rd,s1,s2)
+#define jit_subr_f(rd,s1,s2)		jit_subr_d(rd,s1,s2)
+#define jit_mulr_f(rd,s1,s2)		jit_mulr_d(rd,s1,s2)
+#define jit_divr_f(rd,s1,s2)		jit_divr_d(rd,s1,s2)
+#define jit_movr_f(rd,rs)		jit_movr_d(rd,rs)
+#define jit_abs_f(rd,rs)		jit_abs_d(rd,rs)
+#define jit_negr_f(rd,rs)		jit_negr_d(rd,rs)
+#define jit_sqrt_f(rd,rs)		jit_sqrt_d(rd,rs)
+#define jit_extr_f_d(rd, rs)		jit_movr_d(rd, rs)	/* float and double share a format here: plain move */
+#define jit_extr_d_f(rd, rs)		jit_movr_d(rd, rs)	/* ditto; a no-op when rd == rs */
+#define jit_extr_i_f(rd, rs)		jit_extr_i_d(rd, rs)
+#define jit_roundr_f_i(rd, rs)		jit_roundr_d_i(rd, rs)
+#define jit_floorr_f_i(rd, rs)		jit_floorr_d_i(rd, rs)
+#define jit_ceilr_f_i(rd, rs)		jit_ceilr_d_i(rd, rs)
+#define jit_truncr_f_i(rd, rs)		jit_truncr_d_i(rd, rs)
+#define jit_ltr_f(d, s1, s2)		jit_ltr_d(d, s1, s2)
+#define jit_ler_f(d, s1, s2)		jit_ler_d(d, s1, s2)
+#define jit_eqr_f(d, s1, s2)		jit_eqr_d(d, s1, s2)
+#define jit_ner_f(d, s1, s2)		jit_ner_d(d, s1, s2)
+#define jit_ger_f(d, s1, s2)		jit_ger_d(d, s1, s2)
+#define jit_gtr_f(d, s1, s2)		jit_gtr_d(d, s1, s2)
+#define jit_unltr_f(d, s1, s2)		jit_unltr_d(d, s1, s2)
+#define jit_unler_f(d, s1, s2)		jit_unler_d(d, s1, s2)
+#define jit_uneqr_f(d, s1, s2)		jit_uneqr_d(d, s1, s2)
+#define jit_ltgtr_f(d, s1, s2)		jit_ltgtr_d(d, s1, s2)
+#define jit_unger_f(d, s1, s2)		jit_unger_d(d, s1, s2)
+#define jit_ungtr_f(d, s1, s2)		jit_ungtr_d(d, s1, s2)
+#define jit_ordr_f(d, s1, s2)		jit_ordr_d(d, s1, s2)
+#define jit_unordr_f(d, s1, s2)		jit_unordr_d(d, s1, s2)
+#define jit_retval_f(rs)		jit_retval_d(rs)
 #endif
--- a/lightning/funcs-common.h
+++ b/lightning/funcs-common.h
@ -45,4 +45,10 @@ jit_fail(const char *msg, const char *file, int line, const char *function)
  abort();
 }

+
+#ifndef jit_start_pfx
+#define jit_start_pfx()                 ( (jit_insn*)0x4)
+#define jit_end_pfx()                   ( (jit_insn*)0x0)
+#endif
+
 #endif /* __lightning_funcs_common_h */
--- a/lightning/i386/asm.h
+++ b/lightning/i386/asm.h
@ -43,6 +43,7 @@

 typedef _uc		jit_insn;

+#ifndef LIGHTNING_DEBUG
 #define _b00		0
 #define _b01		1
 #define _b10		2
@ -340,10 +341,10 @@ typedef _uc		jit_insn;

 #define CALLsm(D,B,I,S)			_O_r_X	(0xff	     ,_b010	,(int)(D),B,I,S		)

-#define CBW()				_O		(0x98								)
-#define CLC()				_O		(0xf8								)
-#define CLTD()				_O		(0x99								)
-#define CMC()				_O		(0xf5								)
+#define CBW_()				_O		(0x98								)
+#define CLC_()				_O		(0xf8								)
+#define CLTD_()				_O		(0x99								)
+#define CMC_()				_O		(0xf5								)


 #define CMPBrr(RS, RD)			_O_Mrm		(0x38		,_b11,_r1(RS),_r1(RD)				)
@ -365,7 +366,7 @@ typedef _uc		jit_insn;
 #define CMPLim(IM, MD, MB, MI, MS)	_O_r_X_L	(0x81		     ,_b111		,MD,MB,MI,MS	,IM	)


-#define CWD()				_O		(0x99								)
+#define CWD_()				_O		(0x99								)


 #define CMPXCHGBrr(RS,RD)		_OO_Mrm		(0x0fb0		,_b11,_r1(RS),_r1(RD)				)
@ -399,7 +400,7 @@ typedef _uc		jit_insn;


 #define ENTERii(W, B)			_O_W_B		(0xc8						  ,_su16(W),_su8(B))
-#define HLT()				_O		(0xf4								)
+#define HLT_()				_O		(0xf4								)


 #define IDIVBr(RS)			_O_Mrm		(0xf6		,_b11,_b111  ,_r1(RS)				)
@ -443,7 +444,7 @@ typedef _uc		jit_insn;
 #define INCLm(MD,MB,MI,MS)		_O_r_X		(0xff		     ,_b000		,MD,MB,MI,MS		)


-#define INVD()				_OO		(0x0f08								)
+#define INVD_()				_OO		(0x0f08								)
 #define INVLPGm(MD, MB, MI, MS)		_OO_r_X		(0x0f01		     ,_b111		,MD,MB,MI,MS		)


@ -523,9 +524,9 @@ typedef _uc		jit_insn;
 #define JMPsm(D,B,I,S)			_O_r_X	(0xff	     ,_b100	,(int)(D),B,I,S		)


-#define LAHF()				_O		(0x9f								)
+#define LAHF_()				_O		(0x9f								)
 #define LEALmr(MD, MB, MI, MS, RD)	_O_r_X		(0x8d		     ,_r4(RD)		,MD,MB,MI,MS		)
-#define LEAVE()				_O		(0xc9								)
+#define LEAVE_()			_O		(0xc9								)


 #define LMSWr(RS)			_OO_Mrm		(0x0f01		,_b11,_b110,_r4(RS)				)
@ -600,7 +601,7 @@ typedef _uc		jit_insn;
 #define NEGLm(MD,MB,MI,MS)		_O_r_X		(0xf7		     ,_b011		,MD,MB,MI,MS		)


-#define NOP()				_O		(0x90								)
+#define NOP_()				_O		(0x90								)


 #define NOTBr(RD)			_O_Mrm		(0xf6		,_b11,_b010  ,_r1(RD)				)
@ -639,11 +640,11 @@ typedef _uc		jit_insn;
 #define POPLm(MD,MB,MI,MS)		_O_r_X		(0x8f		     ,_b000		,MD,MB,MI,MS		)


-#define POPA()				_wO		(0x61								)
-#define POPAD()				_O		(0x61								)
+#define POPA_()				_wO		(0x61								)
+#define POPAD_()			_O		(0x61								)

-#define POPF()				_wO		(0x9d								)
-#define POPFD()				_O		(0x9d								)
+#define POPF_()				_wO		(0x9d								)
+#define POPFD_()			_O		(0x9d								)


 #define PUSHWr(R)			_wOr		(0x50,_r2(R)							)
@ -655,13 +656,13 @@ typedef _uc		jit_insn;
 #define PUSHLi(IM)			_Os_sL		(0x68							,IM	)


-#define PUSHA()				_wO		(0x60								)
-#define PUSHAD()			_O		(0x60								)
+#define PUSHA_()			_wO		(0x60								)
+#define PUSHAD_()			_O		(0x60								)

-#define PUSHF()				_O		(0x9c								)
-#define PUSHFD()			_wO		(0x9c								)
+#define PUSHF_()			_wO		(0x9c								)	/* 16-bit form, encoded like POPF_ */
+#define PUSHFD_()			_O		(0x9c								)	/* 32-bit form, encoded like POPFD_ */

-#define RET()				_O		(0xc3								)
+#define RET_()				_O		(0xc3								)
 #define RETi(IM)			_O_W		(0xc2							,_su16(IM))


@ -721,7 +722,7 @@ typedef _uc		jit_insn;
 						JITFAIL		("source register must be CL"				) )


-#define SAHF()					_O	(0x9e								)
+#define SAHF_()					_O	(0x9e								)


 #define SALBir	SHLBir
@ -904,7 +905,7 @@ typedef _uc		jit_insn;
 						JITFAIL		("source register must be CL"				) )


-#define STC()				_O		(0xf9								)
+#define STC_()				_O		(0xf9								)


 #define SUBBrr(RS, RD)			_O_Mrm		(0x28		,_b11,_r1(RS),_r1(RD)				)
@ -985,8 +986,8 @@ typedef _uc		jit_insn;
 #define ESCmi(D,B,I,S,OP)	_O_r_X(0xd8|(OP >> 3), (OP & 7), D,B,I,S)
 #define ESCri(RD,OP)		_O_Mrm(0xd8|(OP >> 3), _b11, (OP & 7), RD)

-#define ESCrri(RS,RD,OP)	((RS) = _ST0 ? ESCri(RD,(OP|040))			\
-				 : (RD) = _ST0 ? ESCri(RS,OP)				\
+#define ESCrri(RS,RD,OP)	((RS) == _ST0 ? ESCri(RD,(OP|040))			\
+				 : (RD) == _ST0 ? ESCri(RS,OP)				\
 				 : JITFAIL ("coprocessor instruction without st0"))

 #define FLDSm(D,B,I,S)		ESCmi(D,B,I,S,010)     /* fld m32real  */
@ -1036,7 +1037,7 @@ typedef _uc		jit_insn;
 #define FNSTSWr(RD)		((RD == _AX || RD == _EAX) ? _OO (0xdfe0)		\
 				 : JITFAIL ("AX or EAX expected"))
 /* N byte NOPs */
-#define _NOPi(N)	(((  (N)    >= 8) ? (_jit_B(0x8d),_jit_B(0xb4),_jit_B(0x26),_jit_L(0x00),_jit_B(0x90)) : (void) 0), \
+#define NOPi(N)		(((  (N)    >= 8) ? (_jit_B(0x8d),_jit_B(0xb4),_jit_B(0x26),_jit_L(0x00),_jit_B(0x90)) : (void) 0), \
 			 (( ((N)&7) == 7) ? (_jit_B(0x8d),_jit_B(0xb4),_jit_B(0x26),_jit_L(0x00)) : \
 			  ( ((N)&7) == 6) ? (_jit_B(0x8d),_jit_B(0xb6),_jit_L(0x00)) : \
 			  ( ((N)&7) == 5) ? (_jit_B(0x90),_jit_B(0x8d),_jit_B(0x74),_jit_B(0x26),_jit_B(0x00)) : \
@ -1056,5 +1057,6 @@ typedef _uc		jit_insn;
 /* [2] "Intel Architecture Software Developer's Manual Volume 2: Instruction Set Reference",	*/
 /*     Intel Corporation 1997.									*/

+#endif
 #endif /* __lightning_asm_h */

--- a/lightning/i386/core.h
+++ b/lightning/i386/core.h
@ -34,16 +34,15 @@
 #ifndef __lightning_core_h
 #define __lightning_core_h

-#define JIT_R0			_EAX
-#define JIT_R1			_ECX
-#define JIT_R2			_EDX
-#define JIT_V0			_EBX
-#define JIT_V1			_ESI
-#define JIT_V2			_EDI
 #define JIT_FP			_EBP
 #define JIT_SP			_ESP
 #define JIT_RET			_EAX

+#define JIT_R_NUM		3
+#define JIT_V_NUM		3
+#define JIT_R(i)		(_EAX + (i))
+#define JIT_V(i)		((i) == 0 ? _EBX : _ESI + (i) - 1)
+
 struct jit_local_state {
  int	framesize;
  int	argssize;
@ -265,10 +264,13 @@ struct jit_local_state {

 /* The += allows for stack pollution */

-#define jitfp_prepare(ni,nf,nd) ((void) (_jitl.argssize += (ni) + (nf) + 2*(nd)))
+#define jit_prepare_i(ni)	(_jitl.argssize += (ni))
+#define jit_prepare_f(nf)	(_jitl.argssize += (nf))
+#define jit_prepare_d(nd)	(_jitl.argssize += 2 * (nd))
 #define jit_pusharg_i(rs)	PUSHLr(rs)
 #define jit_finish(sub)		(jit_calli((sub)), ADDLir(4 * _jitl.argssize, JIT_SP), _jitl.argssize = 0)
-#define jit_retval(rd)		jit_movr_i ((rd), _EAX)
+#define jit_finishr(reg)	(jit_callr((reg)), ADDLir(4 * _jitl.argssize, JIT_SP), _jitl.argssize = 0)
+#define jit_retval_i(rd)	jit_movr_i ((rd), _EAX)

 #define	jit_arg_c()		((_jitl.framesize += sizeof(int)) - sizeof(int))
 #define	jit_arg_uc()		((_jitl.framesize += sizeof(int)) - sizeof(int))
@ -289,6 +291,8 @@ struct jit_local_state {

 #define jit_movr_i(d, rs)	((rs) == (d) ? 0 : MOVLrr((rs), (d)))
 #define jit_movi_i(d, is)	((is) ? MOVLir((is), (d)) : XORLrr ((d), (d)) )
+#define jit_movi_p(d, is)	(MOVLir((is), (d)), _jit.x.pc)
+#define jit_patch_movi(pa,pv)   (*_PSL((pa) - 4) = _jit_SL((pv)))

 #define jit_ntoh_ui(d, rs)	jit_op_((d), (rs), BSWAPLr(d))
 #define jit_ntoh_us(d, rs)	jit_op_((d), (rs), RORWir(8, d))
@ -311,7 +315,7 @@ struct jit_local_state {
 #define jit_gei_i(d, rs, is)	jit_bool_i0((d), (rs), (is), SETGEr, SETNSr )
 #define jit_eqi_i(d, rs, is)	jit_bool_i0((d), (rs), (is), SETEr,  SETEr  )
 #define jit_nei_i(d, rs, is)	jit_bool_i0((d), (rs), (is), SETNEr, SETNEr )
-#define jit_lti_ui(d, rs, is)	jit_bool_i ((d), (rs), (is), SETB	    )
+#define jit_lti_ui(d, rs, is)	jit_bool_i ((d), (rs), (is), SETBr	    )
 #define jit_lei_ui(d, rs, is)	jit_bool_i0((d), (rs), (is), SETBEr, SETEr  )
 #define jit_gti_ui(d, rs, is)	jit_bool_i0((d), (rs), (is), SETAr,  SETNEr )
 #define jit_gei_ui(d, rs, is)	jit_bool_i0((d), (rs), (is), SETAEr, INCLr  )
@ -340,10 +344,10 @@ struct jit_local_state {
 #define jit_bgei_i(label, rs, is)	jit_bra_i0((rs), (is), JGEm(label,0,0,0), JNSm(label,0,0,0) )
 #define jit_beqi_i(label, rs, is)	jit_bra_i0((rs), (is), JEm(label, 0,0,0), JEm(label, 0,0,0) )
 #define jit_bnei_i(label, rs, is)	jit_bra_i0((rs), (is), JNEm(label,0,0,0), JNEm(label,0,0,0) )
-#define jit_blti_ui(label, rs, is)	jit_bra_i ((rs), (is), JLm(label, 0,0,0)		    )
-#define jit_blei_ui(label, rs, is)	jit_bra_i0((rs), (is), JLEm(label,0,0,0), JEm(label, 0,0,0) )
-#define jit_bgti_ui(label, rs, is)	jit_bra_i0((rs), (is), JGm(label, 0,0,0), JNEm(label,0,0,0) )
-#define jit_bgei_ui(label, rs, is)	jit_bra_i ((rs), (is), JGEm(label,0,0,0)		    )
+#define jit_blti_ui(label, rs, is)	jit_bra_i ((rs), (is), JBm(label, 0,0,0)		    )
+#define jit_blei_ui(label, rs, is)	jit_bra_i0((rs), (is), JBEm(label,0,0,0), JEm(label, 0,0,0) )
+#define jit_bgti_ui(label, rs, is)	jit_bra_i0((rs), (is), JAm(label, 0,0,0), JNEm(label,0,0,0) )
+#define jit_bgei_ui(label, rs, is)	jit_bra_i ((rs), (is), JAEm(label,0,0,0)		    )
 #define jit_boaddi_i(label, rs, is)	(ADDLir((is), (rs)), JOm(label,0,0,0), _jit.x.pc)
 #define jit_bosubi_i(label, rs, is)	(SUBLir((is), (rs)), JOm(label,0,0,0), _jit.x.pc)
 #define jit_boaddi_ui(label, rs, is)	(ADDLir((is), (rs)), JCm(label,0,0,0), _jit.x.pc)
@ -354,9 +358,10 @@ struct jit_local_state {

 #define jit_jmpi(label)		(JMPm( ((unsigned long) (label)),	0, 0, 0), _jit.x.pc)
 #define jit_calli(label)	(CALLm( ((unsigned long) (label)),	0, 0, 0), _jit.x.pc)
+#define jit_callr(reg)		(CALLsr(reg))
 #define jit_jmpr(reg)		JMPsr(reg)
-#define jit_patch(jump_pc)	(*_PSL((jump_pc) - 4) = _jit_SL(_jit.x.pc - (jump_pc)))
-#define jit_ret()		(POPLr(_EDI), POPLr(_ESI), POPLr(_EBX), POPLr(_EBP), RET())
+#define jit_patch_at(jump_pc,v)	(*_PSL((jump_pc) - 4) = _jit_SL((v) - (jump_pc)))
+#define jit_ret()		(POPLr(_EDI), POPLr(_ESI), POPLr(_EBX), POPLr(_EBP), RET_())

 /* Memory */
 #define jit_ldi_c(d, is)		MOVSBLmr((is), 0,    0,    0, (d))
@ -400,9 +405,9 @@ struct jit_local_state {
 #define jit_stxi_i(id, rd, rs)		MOVLrm((rs), (id), (rd), 0,    0)

 /* Extra */
-#define jit_nop()			NOP()
+#define jit_nop()			NOP_()

 #define _jit_alignment(pc, n)		(((pc ^ _MASK(4)) + 1) & _MASK(n))
-#define jit_align(n) 			_NOPi(_jit_alignment(_jit_UL(_jit.x.pc), (n)))
+#define jit_align(n) 			NOPi(_jit_alignment(_jit_UL(_jit.x.pc), (n)))

 #endif /* __lightning_core_h */
--- a/lightning/i386/fp.h
+++ b/lightning/i386/fp.h
@ -33,49 +33,48 @@
 #ifndef __lightning_asm_fp_h
 #define __lightning_asm_fp_h

-/* Actually, we should redesign the jitfp interface.  As a first step, I have
-   defined the macros for many x87 instructions, and I am using them here.
+/* We really must map the x87 stack onto a flat register file.  In practice,
+   we can provide something sensible and make it work on the x86 using the
+   stack like a file of eight registers.

-   In practice, we can provide something sensible and make it work on the x86
-   using the stack like a file of eight registers.  Then this awful stuff goes
-   away, and everything is "beautiful" as the rest of GNU lightning---and we'll
-   document it, promised.
-
-   Well, let's use six or seven registers so as to have some freedom
-   for floor, ceil, round, log, tan, atn and exp.
+   We use six or seven registers so as to have some freedom
+   for floor, ceil, round, (and log, tan, atn and exp).

   Not hard at all, basically play with FXCH.  FXCH is mostly free,
   so the generated code is not bad.  Of course we special case when one
   of the operands turns out to be ST0.

-   - binary ops:
+   Here are the macros that actually do the trick.  */

-	add FRR3 to FPR0
-		FADD ST0,ST3
+#define JIT_FPR_NUM	       6
+#define JIT_FPR(i)	       (i)

-	add FPR0 to FPR3
-		FADD ST3,ST0
+#define jit_fxch(rs, op)       (((rs) != 0 ? FXCHr(rs) : 0),   \
+                                op, ((rs) != 0 ? FXCHr(rs) : 0))

-	add FPR3 to FPR7 (I'm using nasm syntax here)
-		FXCH ST3
-		FADD ST7,ST0
-		FXCH ST3
+#define jit_fp_unary(rd, s1, op)                       \
+       ((rd) == (s1) ? jit_fxch ((rd), op)             \
+        : (rd) == 0 ? (FSTPr (0), FLDr ((s1)-1), op)   \
+        : (FLDr ((s1)), op, FSTPr ((rd))))

-   - stores:
+#define jit_fp_binary(rd, s1, s2, op, opr)             \
+       ((rd) == (s1) ?                                 \
+          ((s2) == 0 ? opr(0, (rd))                    \
+           : (s2) == (s1) ? jit_fxch((rd), op(0, 0))   \
+           : jit_fxch((rd), op((s2), 0)))              \
+        : (rd) == (s2) ? jit_fxch((s1), opr(0, (rd) == 0 ? (s1) : (rd)))       \
+        : (FLDr (s1), op(0, (s2)+1), FSTPr((rd)+1)))

-	store FPR3
+#define jit_addr_d(rd,s1,s2)    jit_fp_binary((rd),(s1),(s2),FADDrr,FADDrr)
+#define jit_subr_d(rd,s1,s2)    jit_fp_binary((rd),(s1),(s2),FSUBrr,FSUBRrr)
+#define jit_mulr_d(rd,s1,s2)    jit_fp_binary((rd),(s1),(s2),FMULrr,FMULrr)
+#define jit_divr_d(rd,s1,s2)    jit_fp_binary((rd),(s1),(s2),FDIVrr,FDIVRrr)

-		FXCH ST3
-		FST [FUBAR]
-		FXCH ST3
+#define jit_abs_d(rd,rs)       jit_fp_unary ((rd), (rs), _OO (0xd9e1))
+#define jit_negr_d(rd,rs)      jit_fp_unary ((rd), (rs), _OO (0xd9e0))
+#define jit_sqrt_d(rd,rs)      jit_fp_unary ((rd), (rs), _OO (0xd9fa))

-	store FPR0
-
-		FST [FUBAR]
-
-	(and similarly for other unary ops like FCHS or FABS)
-
-   - moves:
+/* - moves:

 	move FPR0 to FPR3
 		FST  ST3
@ -85,11 +84,16 @@
 		FST  ST3

 	move FPR3 to FPR1
-		FSTP ST1   Save old st0 into destination register
-		FLD  ST2   Stack is rotated, so FPRn becomes STn-1
-		FXCH ST1   Get back old st0
+                FLD  ST3
+                FSTP ST2   Stack is rotated, so FPRn becomes STn+1 */

-   - loads:
+#define jit_movr_d(rd,s1)                              \
+       ((s1) == (rd) ? 0                               \
+        : (s1) == 0 ? FSTr ((rd))                      \
+        : (rd) == 0 ? (FXCHr ((s1)), FSTr ((s1)))      \
+        : (FLDr ((s1)), FSTPr ((rd)+1)))	/* FSTP pops the FLD temporary so the stack depth is unchanged */
+
+/* - loads:

 	load into FPR0
 		FSTP ST0
@ -102,53 +106,82 @@

   (and similarly for immediates, using the stack) */

-#define jit_add_two(reg0)	FADDPr(1)
-#define jit_sub_two(reg0)	FSUBRPr(1)
-#define jit_mul_two(reg0)	FMULPr(1)
-#define jit_div_two(reg0)	FDIVRPr(1)
+#define jit_movi_f(rd,immf)                     \
+        (_O (0x68),                            \
+         *((float *) _jit.x.pc) = (float) immf, \
+         _jit.x.uc_pc += sizeof (float),       \
+        jit_ldr_f((rd), _ESP),                 \
+        ADDLir(4, _ESP))

-#define jit_abs(reg0)		_OO(0xd9e1)			/* fabs */
-#define jit_sqr(reg0)		FMULrr(0,0)
-#define jit_sqrt(reg0)		_OO(0xd9fa)			/* fsqrt */
+union jit_double_imm {
+  double d;
+  int i[2];
+};

-#define jit_exti_d(reg0, rs)	(PUSHLr((rs)), FILDLm(0, _ESP, 0, 0), POPLr((rs)))
-
-#define jit_neg(reg0)		_OO(0xd9e0)			/* fchs */
-
-#define jit_ldxr_f(reg0, s1, s2)	FLDSm(0, (s1), (s2), 1)
-#define jit_ldxi_f(reg0, rs, is)	FLDSm((is), (rs), 0, 0)
-#define jit_ldxr_f(reg0, s1, s2)	FLDSm(0, (s1), (s2), 1)
-#define jit_ldxi_d(reg0, rs, is)	FLDLm((is), (rs), 0, 0)
-#define jit_ldxr_d(reg0, s1, s2)	FLDLm(0, (s1), (s2), 1)
-#define jit_ldi_f(reg0, is)		FLDSm((is), 0,    0, 0)
-#define jit_ldr_f(reg0, rs)		FLDSm(0,    (rs), 0, 0)
-#define jit_ldi_d(reg0, is)		FLDLm((is), 0,    0, 0)
-#define jit_ldr_d(reg0, rs)		FLDLm(0,    (rs), 0, 0)
-#define jit_stxi_f(id, rd, reg0)	FSTPSm((id), (rd), 0, 0)
-#define jit_stxr_f(d1, d2, reg0)	FSTPSm(0, (d1), (d2), 1)
-#define jit_stxi_d(id, rd, reg0)	FSTPLm((id), (rd), 0, 0)
-#define jit_stxr_d(d1, d2, reg0)	FSTPLm(0, (d1), (d2), 1)
-#define jit_sti_f(id, reg0)		FSTPSm((id), 0,    0, 0)
-#define jit_str_f(rd, reg0)		FSTPSm(0,    (rd), 0, 0)
-#define jit_sti_d(id, reg0)		FSTPLm((id), 0,    0, 0)
-#define jit_str_d(rd, reg0)		FSTPLm(0,    (rd), 0, 0)
-
-#define jit_fpimm(reg0, first, second)	\
-	(PUSHLi(second),		\
-	PUSHLi(first),			\
-	FLDLm(0, _ESP, 0, 0),		\
+#define jit_movi_d(rd,immd)                                                            \
+        (_O (0x68),                                                                    \
+         _jit.x.uc_pc[4] = 0x68,                                                       \
+         ((union jit_double_imm *) (_jit.x.uc_pc + 5))->d = (double) immd,             \
+         *((int *) _jit.x.uc_pc) = ((union jit_double_imm *) (_jit.x.uc_pc + 5))->i[1],        \
+         _jit.x.uc_pc += 9,                                                            \
+        jit_ldr_d((rd), _ESP),                                                         \
        ADDLir(8, _ESP))

+#define jit_ldi_f(rd, is)                              \
+  ((rd) == 0 ? (FSTPr (0), FLDSm((is), 0, 0, 0))       \
+   : (FLDSm((is), 0, 0, 0), FSTPr ((rd) + 1)))
+
+#define jit_ldi_d(rd, is)                              \
+  ((rd) == 0 ? (FSTPr (0), FLDLm((is), 0, 0, 0))       \
+   : (FLDLm((is), 0, 0, 0), FSTPr ((rd) + 1)))
+
+#define jit_ldr_f(rd, rs)                              \
+  ((rd) == 0 ? (FSTPr (0), FLDSm(0, (rs), 0, 0))       \
+   : (FLDSm(0, (rs), 0, 0), FSTPr ((rd) + 1)))
+
+#define jit_ldr_d(rd, rs)                              \
+  ((rd) == 0 ? (FSTPr (0), FLDLm(0, (rs), 0, 0))       \
+   : (FLDLm(0, (rs), 0, 0), FSTPr ((rd) + 1)))
+
+#define jit_ldxi_f(rd, rs, is)                         \
+  ((rd) == 0 ? (FSTPr (0), FLDSm((is), (rs), 0, 0))    \
+   : (FLDSm((is), (rs), 0, 0), FSTPr ((rd) + 1)))
+
+#define jit_ldxi_d(rd, rs, is)                         \
+  ((rd) == 0 ? (FSTPr (0), FLDLm((is), (rs), 0, 0))    \
+   : (FLDLm((is), (rs), 0, 0), FSTPr ((rd) + 1)))
+
+#define jit_ldxr_f(rd, s1, s2)                         \
+  ((rd) == 0 ? (FSTPr (0), FLDSm(0, (s1), (s2), 1))    \
+   : (FLDSm(0, (s1), (s2), 1), FSTPr ((rd) + 1)))
+
+#define jit_ldxr_d(rd, s1, s2)                         \
+  ((rd) == 0 ? (FSTPr (0), FLDLm(0, (s1), (s2), 1))    \
+   : (FLDLm(0, (s1), (s2), 1), FSTPr ((rd) + 1)))
+
+#define jit_extr_i_d(rd, rs)   (PUSHLr((rs)),          \
+  ((rd) == 0 ? (FSTPr (0), FILDLm(0, _ESP, 0, 0))      \
+   : (FILDLm(0, _ESP, 0, 0), FSTPr ((rd) + 1))),       \
+  POPLr((rs)))
+
+#define jit_stxi_f(id, rd, rs) jit_fxch ((rs), FSTSm((id), (rd), 0, 0))
+#define jit_stxr_f(d1, d2, rs) jit_fxch ((rs), FSTSm(0, (d1), (d2), 1))
+#define jit_stxi_d(id, rd, rs) jit_fxch ((rs), FSTLm((id), (rd), 0, 0))
+#define jit_stxr_d(d1, d2, rs) jit_fxch ((rs), FSTLm(0, (d1), (d2), 1))
+#define jit_sti_f(id, rs)      jit_fxch ((rs), FSTSm((id), 0,    0, 0))
+#define jit_str_f(rd, rs)      jit_fxch ((rs), FSTSm(0,    (rd), 0, 0))
+#define jit_sti_d(id, rs)      jit_fxch ((rs), FSTLm((id), 0,    0, 0))
+#define jit_str_d(rd, rs)      jit_fxch ((rs), FSTLm(0,    (rd), 0, 0))

 /* Assume round to near mode */
-#define jit_floor(rd, reg0)	\
-	jit_floor2((rd), ((rd) == _EDX ? _EAX : _EDX))
+#define jit_floorr_d_i(rd, rs) \
+       (FLDr (rs), jit_floor2((rd), ((rd) == _EDX ? _EAX : _EDX)))

-#define jit_ceil(rd, reg0)	\
-	jit_ceil2((rd), ((rd) == _EDX ? _EAX : _EDX))
+#define jit_ceilr_d_i(rd, rs)  \
+       (FLDr (rs), jit_ceil2((rd), ((rd) == _EDX ? _EAX : _EDX)))

-#define jit_trunc(rd, reg0)	\
-	jit_trunc2((rd), ((rd) == _EDX ? _EAX : _EDX))
+#define jit_truncr_d_i(rd, rs) \
+       (FLDr (rs), jit_trunc2((rd), ((rd) == _EDX ? _EAX : _EDX)))

 #define jit_calc_diff(ofs)		\
 	FISTLm(ofs, _ESP, 0, 0),	\
@ -200,41 +233,102 @@
 	POPLr(aux))

 /* the easy one */
-#define jit_round(rd, reg0)		\
+#define jit_roundr_d_i(rd, rs)                         \
        (PUSHLr(_EAX),                                 \
-	FISTPLm(0, _ESP, 0, 0),		\
+        jit_fxch ((rs), FISTPLm(0, _ESP, 0, 0)),       \
 	POPLr((rd)))

-#define jit_cmp(le, ge, reg0) (					\
-	((le) == _EAX || (ge) == _EAX ? 0 : PUSHLr(_EAX)),	\
-	FCOMr(0),						\
-	FNSTSWr(_AX),						\
-	TESTBir(0x40, _AH),					\
-	MOVLir(0, (le)),					\
-	MOVLrr((le), (ge)),					\
-	JZSm(_jit.x.pc + 11, 0, 0, 0),				\
-	_OO(0xd9e4),			/* ftst */	/* 2 */ \
-	FNSTSWr(_AX),					/* 2 */	\
-	SAHF(),						/* 1 */ \
-	SETLEr( ((le) & 15) | 0x10),			/* 3 */ \
-	SETGEr( ((ge) & 15) | 0x10),			/* 3 */ \
-	((le) == _EAX || (ge) == _EAX ? ANDLir (1, _EAX) : POPLr(_EAX)) )
+#define jit_fp_test(d, s1, s2, n, _and, res)           /* d := 0/1 outcome of comparing FP regs s1 and s2 */ \
+       (((s1) == 0 ? FUCOMr((s2)) : (FLDr((s1)), FUCOMPr((s2) + 1))),     /* compare; if s1 is not st(0), push a copy first (s2 then sits one slot deeper) and pop it afterwards */ \
+        ((d) != _EAX ? MOVLrr(_EAX, (d)) : 0),                 /* save EAX in d while EAX is scratch */ \
+        FNSTSWr(_EAX),                                         /* fetch the FPU status word */ \
+        SHRLir(n, _EAX),                                       /* shift so the tested bits land in AL (n == 8) or the last bit shifted out lands in carry */ \
+        ((_and) ? ANDLir((_and), _EAX) : MOVLir(0, _EAX)),     /* keep the _and bits; with no mask, clear EAX (MOV keeps the carry) */ \
+        res,                                                   /* caller's instruction(s) turning that into 0/1 in AL */ \
+        ((d) != _EAX ? _O (0x90 + ((d) & 7)) : 0))     /* xchg: result into d, EAX restored */

-#define jitfp_getarg_f(ofs)             jitfp_ldxi_f(JIT_FP,(ofs))
-#define jitfp_getarg_d(ofs)             jitfp_ldxi_d(JIT_FP,(ofs))
-#define jitfp_pusharg_d(op1)            (jit_subi_i(JIT_SP,JIT_SP,sizeof(double)), jitfp_str_d(JIT_SP,(op1)))
-#define jitfp_pusharg_f(op1)            (jit_subi_i(JIT_SP,JIT_SP,sizeof(float)), jitfp_str_f(JIT_SP,(op1)))
-#define jitfp_retval(op1)               _jit_emit(&_jit, (op1), JIT_NULL, 0, 0, 0)
+#define jit_fp_btest(d, s1, s2, n, _and, cmp, res)             /* emit conditional jump `res' with operand d, taken on the outcome of comparing FP regs s1 and s2 */ \
+       (((s1) == 0 ? FUCOMr((s2)) : (FLDr((s1)), FUCOMPr((s2) + 1))),    /* compare; if s1 is not st(0), push a copy first (s2 then sits one slot deeper) and pop it afterwards */ \
+        PUSHLr(_EAX),                                          /* EAX is scratch: save it */ \
+        FNSTSWr(_EAX),                                         /* fetch the FPU status word */ \
+        SHRLir(n, _EAX),                                       /* shift so the tested bits land in AL (n == 8) or the last bit shifted out lands in carry */ \
+        ((_and) ? ANDLir ((_and), _EAX) : 0),                  /* optional mask */ \
+        ((cmp) ? CMPLir ((cmp), _AL) : 0),                     /* optional comparison with cmp */ \
+        POPLr(_EAX),                                           /* restore EAX; POP does not touch the flags set above */ \
+        res ((d), 0, 0, 0))                                    /* the jump (JZm, JNCm, ...) that tests those flags */

-#define JIT_TRANSCENDENTAL
+#define jit_nothing_needed(x)

-#define jit_sin(reg0)		_OO(0xd9fe)			/* fsin */
-#define jit_cos(reg0)		_OO(0xd9ff)			/* fcos */
-#define jit_tan(reg0)		(_OO(0xd9f2), 			/* fptan */ \
+/* After FNSTSW we have 1 if <, 40 if =, 0 if >, 45 if unordered.  Here
+   is how to map the values of the status word's high byte to the
+   conditions.
+
+         <     =     >     unord    valid values    condition
+  gt     no    no    yes   no       0               STSW & 45 == 0
+  lt     yes   no    no    no       1               STSW & 45 == 1
+  eq     no    yes   no    no       40              STSW & 45 == 40
+  unord  no    no    no    yes      45              bit 2 == 1
+
+  ge     no    yes   yes   no       0, 40           bit 0 == 0
+  unlt   yes   no    no    yes      1, 45           bit 0 == 1
+  ltgt   yes   no    yes   no       0, 1            bit 6 == 0
+  uneq   no    yes   no    yes      40, 45          bit 6 == 1
+  le     yes   yes   no    no       1, 40           odd parity for STSW & 41
+  ungt   no    no    yes   yes      0, 45           even parity for STSW & 41
+
+  unle   yes   yes   no    yes      1, 40, 45       STSW & 45 != 0
+  unge   no    yes   yes   yes      0, 40, 45       STSW & 45 != 1
+  ne     yes   no    yes   yes      0, 1, 45        STSW & 45 != 40
+  ord    yes   yes   yes   no       0, 1, 40        bit 2 == 0
+
+  lt, le, ungt, unge are actually computed as gt, ge, unlt, unle with
+  the operands swapped; it is more efficient this way.  */
+
+#define jit_gtr_d(d, s1, s2)            jit_fp_test((d), (s1), (s2), 8, 0x45, SETZr (_AL))
+#define jit_ger_d(d, s1, s2)            jit_fp_test((d), (s1), (s2), 9, 0, SBBBir (-1, _AL))
+#define jit_unler_d(d, s1, s2)          jit_fp_test((d), (s1), (s2), 8, 0x45, SETNZr (_AL))
+#define jit_unltr_d(d, s1, s2)          jit_fp_test((d), (s1), (s2), 9, 0, ADCBir (0, _AL))
+#define jit_ltr_d(d, s1, s2)            jit_fp_test((d), (s2), (s1), 8, 0x45, SETZr (_AL))
+#define jit_ler_d(d, s1, s2)            jit_fp_test((d), (s2), (s1), 9, 0, SBBBir (-1, _AL))
+#define jit_unger_d(d, s1, s2)          jit_fp_test((d), (s2), (s1), 8, 0x45, SETNZr (_AL))
+#define jit_ungtr_d(d, s1, s2)          jit_fp_test((d), (s2), (s1), 9, 0, ADCBir (0, _AL))
+#define jit_eqr_d(d, s1, s2)            jit_fp_test((d), (s1), (s2), 8, 0x45, (CMPBir (0x40, _AL), SETEr (_AL)))
+#define jit_ner_d(d, s1, s2)            jit_fp_test((d), (s1), (s2), 8, 0x45, (CMPBir (0x40, _AL), SETNEr (_AL)))
+#define jit_ltgtr_d(d, s1, s2)          jit_fp_test((d), (s1), (s2), 15, 0, SBBBir (-1, _AL))
+#define jit_uneqr_d(d, s1, s2)          jit_fp_test((d), (s1), (s2), 15, 0, ADCBir (0, _AL))
+#define jit_ordr_d(d, s1, s2)           jit_fp_test((d), (s1), (s2), 11, 0, SBBBir (-1, _AL))
+#define jit_unordr_d(d, s1, s2)         jit_fp_test((d), (s1), (s2), 11, 0, ADCBir (0, _AL))
+
+#define jit_bgtr_d(d, s1, s2)           jit_fp_btest((d), (s1), (s2), 8, 0x45, 0, JZm)
+#define jit_bger_d(d, s1, s2)           jit_fp_btest((d), (s1), (s2), 9, 0, 0, JNCm)
+#define jit_bunler_d(d, s1, s2)         jit_fp_btest((d), (s1), (s2), 8, 0x45, 0, JNZm)
+#define jit_bunltr_d(d, s1, s2)         jit_fp_btest((d), (s1), (s2), 9, 0, 0, JCm)
+#define jit_bltr_d(d, s1, s2)           jit_fp_btest((d), (s2), (s1), 8, 0x45, 0, JZm)
+#define jit_bler_d(d, s1, s2)           jit_fp_btest((d), (s2), (s1), 9, 0, 0, JNCm)
+#define jit_bunger_d(d, s1, s2)         jit_fp_btest((d), (s2), (s1), 8, 0x45, 0, JNZm)
+#define jit_bungtr_d(d, s1, s2)         jit_fp_btest((d), (s2), (s1), 9, 0, 0, JCm)
+#define jit_beqr_d(d, s1, s2)           jit_fp_btest((d), (s1), (s2), 8, 0x45, 0x40, JZm)
+#define jit_bner_d(d, s1, s2)           jit_fp_btest((d), (s1), (s2), 8, 0x45, 0x40, JNZm)
+#define jit_bltgtr_d(d, s1, s2)         jit_fp_btest((d), (s1), (s2), 15, 0, 0, JNCm)
+#define jit_buneqr_d(d, s1, s2)         jit_fp_btest((d), (s1), (s2), 15, 0, 0, JCm)
+#define jit_bordr_d(d, s1, s2)          jit_fp_btest((d), (s1), (s2), 11, 0, 0, JNCm)
+#define jit_bunordr_d(d, s1, s2)        jit_fp_btest((d), (s1), (s2), 11, 0, 0, JCm)
+
+#define jit_getarg_f(rd, ofs)        jit_ldxi_f((rd), JIT_FP,(ofs))
+#define jit_getarg_d(rd, ofs)        jit_ldxi_d((rd), JIT_FP,(ofs))
+#define jit_pusharg_d(rs)            (jit_subi_i(JIT_SP,JIT_SP,sizeof(double)), jit_str_d(JIT_SP,(rs)))
+#define jit_pusharg_f(rs)            (jit_subi_i(JIT_SP,JIT_SP,sizeof(float)), jit_str_f(JIT_SP,(rs)))
+#define jit_retval_d(op1)            jit_movr_d(0, (op1))
+
+
+#if 0
+#define jit_sin()	_OO(0xd9fe)			/* fsin */
+#define jit_cos()	_OO(0xd9ff)			/* fcos */
+#define jit_tan()	(_OO(0xd9f2), 			/* fptan */ \
 			 FSTPr(0))			/* fstp st */
-#define jit_atn(reg0)		(_OO(0xd9e8), 			/* fld1 */ \
+#define jit_atn()	(_OO(0xd9e8), 			/* fld1 */ \
 			 _OO(0xd9f3))			/* fpatan */
-#define jit_exp(reg0)		(_OO(0xd9ea), 			/* fldl2e */ \
+#define jit_exp()	(_OO(0xd9ea), 			/* fldl2e */ \
 			 FMULPr(1), 			/* fmulp */ \
 			 _OO(0xd9c0),			/* fld st */ \
 			 _OO(0xd9fc),		 	/* frndint */ \
@ -245,8 +339,9 @@
 			 _OO(0xdec1), 			/* faddp */ \
 			 _OO(0xd9fd), 			/* fscale */ \
 			 FSTPr(1))			/* fstp st(1) */
-#define jit_log(reg0)		(_OO(0xd9ed), 			/* fldln2 */ \
+#define jit_log()	(_OO(0xd9ed), 			/* fldln2 */ \
 			 FXCHr(1), 			/* fxch st(1) */ \
 			 _OO(0xd9f1))			/* fyl2x */
+#endif

 #endif /* __lightning_asm_h */
--- a/lightning/i386/funcs.h
+++ b/lightning/i386/funcs.h
@ -34,6 +34,71 @@
 #ifndef __lightning_funcs_h
 #define __lightning_funcs_h

-#define jit_flush_code(dest, end)
+#ifdef __linux__
+#include <unistd.h>
+#include <sys/mman.h>
+#endif
+
+/* Make the code between DEST and END executable.  This does nothing
+   unless compiled with __linux__ defined.  */
+static void
+jit_flush_code(void *dest, void *end)
+{
+  /* On the x86, the PROT_EXEC bits are not handled by the MMU.
+     However, the kernel can emulate this by setting the code
+     segment's limit to the end address of the highest page
+     whose PROT_EXEC bit is set.
+
+     Linux kernels that do so and that disable by default the
+     execution of the data and stack segment are becoming more
+     and more common (Fedora, for example), so we implement our
+     jit_flush_code as an mprotect.  */
+#ifdef __linux__
+  /* Last area recorded as executable; calls that fall inside it
+     skip the system call.  */
+  static unsigned long prev_page = 0, prev_length = 0;
+  unsigned long page, length, page_mask;
+#ifdef PAGESIZE
+  const int page_size = PAGESIZE;
+#else
+  static int page_size = -1;
+  if (page_size == -1)
+    page_size = sysconf (_SC_PAGESIZE);
+#endif
+
+  /* Round DEST down and END up to page boundaries.  */
+  page_mask = ~((unsigned long) page_size - 1);
+  page = (unsigned long) dest & page_mask;
+  length = ((unsigned long) end - page + page_size - 1) & page_mask;
+
+  /* Simple-minded attempt at optimizing the common case where a single
+     chunk of memory is used to compile multiple functions.  */
+  if (page >= prev_page && page + length <= prev_page + prev_length)
+    return;
+
+  /* If the kernel refuses, leave the recorded area alone so that a
+     later call tries again.  */
+  if (mprotect ((void *) page, length,
+                PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
+    return;
+
+  /* See if we can extend the previously mprotect'ed memory area towards
+     higher addresses: the starting address remains the same as before.  */
+  if (page >= prev_page && page <= prev_page + prev_length)
+    prev_length = page + length - prev_page;
+
+  /* See if we can extend the previously mprotect'ed memory area towards
+     lower addresses: the highest address remains the same as before.
+     The new area has to reach the old one, or the gap between the two
+     would be recorded as executable without having been mprotect'ed.  */
+  else if (page < prev_page && page + length >= prev_page
+           && page + length <= prev_page + prev_length)
+    prev_length += prev_page - page, prev_page = page;
+
+  /* Nothing to do, replace the area.  */
+  else
+    prev_page = page, prev_length = length;
+#endif
+}

 #endif /* __lightning_funcs_h */
--- a/lightning/ppc/asm.h
+++ b/lightning/ppc/asm.h
@ -61,6 +61,7 @@

 typedef unsigned int jit_insn;

+#ifndef LIGHTNING_DEBUG
 #define _cr0	0
 #define _cr1	1
 #define _cr2	2
@ -81,9 +82,9 @@ typedef unsigned int jit_insn;

 /* primitive instruction forms [1, Section A.4] */

-#define _FB(  OP,         BD,AA,LK )	_jit_I((_u6(OP)<<26)|                                            _d26(BD)|     (_u1(AA)<<1)|_u1(LK))
+#define _FB(  OP,         BD,AA,LK )    (_jit_I_noinc((_u6(OP)<<26)|                                            _d26(BD)|     (_u1(AA)<<1)|_u1(LK)), _jit.x.pc++, 0)
 #define _FBA( OP,         BD,AA,LK )	_jit_I((_u6(OP)<<26)|                                           (_u26(BD)&~3)| (_u1(AA)<<1)|_u1(LK))
-#define _BB(   OP,BO,BI,   BD,AA,LK )  	_jit_I((_u6(OP)<<26)|(_u5(BO)<<21)|(_u5(BI)<<16)|                _d16(BD)|     (_u1(AA)<<1)|_u1(LK))
+#define _BB(   OP,BO,BI,   BD,AA,LK )   (_jit_I_noinc((_u6(OP)<<26)|(_u5(BO)<<21)|(_u5(BI)<<16)|                _d16(BD)|     (_u1(AA)<<1)|_u1(LK)), _jit.x.pc++, 0)
 #define _D(   OP,RD,RA,         DD )  	_jit_I((_u6(OP)<<26)|(_u5(RD)<<21)|(_u5(RA)<<16)|                _s16(DD)                          )
 #define _Du(  OP,RD,RA,         DD )  	_jit_I((_u6(OP)<<26)|(_u5(RD)<<21)|(_u5(RA)<<16)|                _u16(DD)                          )
 #define _Ds(  OP,RD,RA,         DD )  	_jit_I((_u6(OP)<<26)|(_u5(RD)<<21)|(_u5(RA)<<16)|                _su16(DD)                         )
@ -93,6 +94,7 @@ typedef unsigned int jit_insn;
 #define _XO(  OP,RD,RA,RB,OE,XO,RC )  	_jit_I((_u6(OP)<<26)|(_u5(RD)<<21)|(_u5(RA)<<16)|( _u5(RB)<<11)|(_u1(OE)<<10)|( _u9(XO)<<1)|_u1(RC))
 #define _M(   OP,RS,RA,SH,MB,ME,RC )  	_jit_I((_u6(OP)<<26)|(_u5(RS)<<21)|(_u5(RA)<<16)|( _u5(SH)<<11)|(_u5(MB)<< 6)|( _u5(ME)<<1)|_u1(RC))

+
 /* special purpose registers (form XFX) [1, Section 8.2, page 8-138] */

 #define SPR_LR		((8<<5)|(0))
@ -313,10 +315,10 @@ typedef unsigned int jit_insn;
 #define MOVEIri(R,I)			(_siP(16,I) ? LIri(R,I) :	\
 					MOVEIri2(R, _HI(I), _LO(I)) )

-#define SUBIrri(RD,RA,IM)		ADDIrri(RD,RA,-_jit_L((IM)))	/* [1, Section F.2.1] */
-#define SUBISrri(RD,RA,IM)		ADDISrri(RD,RA,-_jit_L((IM)))
-#define SUBICrri(RD,RA,IM)		ADDICrri(RD,RA,-_jit_L((IM)))
-#define SUBIC_rri(RD,RA,IM)		ADDIC_rri(RD,RA,-_jit_L((IM)))
+#define SUBIrri(RD,RA,IM)		ADDIrri(RD,RA,-_LO((IM)))	/* [1, Section F.2.1] */
+#define SUBISrri(RD,RA,IM)		ADDISrri(RD,RA,-_LO((IM)))
+#define SUBICrri(RD,RA,IM)		ADDICrri(RD,RA,-_LO((IM)))
+#define SUBIC_rri(RD,RA,IM)		ADDIC_rri(RD,RA,-_LO((IM)))

 #define SUBrrr(RD,RA,RB)		SUBFrrr(RD,RB,RA)	/* [1, Section F.2.2] */
 #define SUBOrrr(RD,RA,RB)		SUBFOrrr(RD,RB,RA)
@ -350,17 +352,21 @@ typedef unsigned int jit_insn;
 #define CLRRWIrri(RA,RS,N)		RLWINMrriii(RA, RS,            0,	0,    31-(N))
 #define CLRLSLWIrrii(RA,RS,B,N)		RLWINMrriii(RA, RS,            N, (B)-(N),    31-(N))

+
 /* 9 below inverts the branch condition and the branch prediction.
- * This has an incestuous knowledge of the fact that register 26
- * is used as auxiliary!!! */
+ * This has an incestuous knowledge of JIT_AUX */
 #define BC_EXT(A, C, D)  (_siP(16, _jit_UL(D)-_jit_UL(_jit.x.pc)) \
  ? BCiii((A), (C), (D)) \
-  : (BCiii((A)^9, (C), _jit.x.pc+5), LISri(26,_HI(D)), ORIrri(26,26,_LO(D)), \
-     MTLRr(26), BLR() ))
+  : (BCiii((A)^9, (C), _jit.x.pc+5), \
+     LISri(JIT_AUX,_HI(D)), \
+     ORIrri(JIT_AUX,JIT_AUX,_LO(D)), \
+     MTLRr(JIT_AUX), BLR() ))

 #define B_EXT(D)         (_siP(16, _jit_UL(D)-_jit_UL(_jit.x.pc)) \
  ? Bi((D)) \
-  : (LISri(26,_HI(D)), ORIrri(26,26,_LO(D)), MTLRr(26), BLR()) )
+  : (LISri(JIT_AUX,_HI(D)), \
+     ORIrri(JIT_AUX,JIT_AUX,_LO(D)), \
+     MTLRr(JIT_AUX), BLR()) )

 #define BTii(C,D)			BC_EXT(12, C, D)		/* [1, Table F-5] */
 #define BFii(C,D)			BC_EXT( 4, C, D)
@ -379,7 +385,7 @@ typedef unsigned int jit_insn;
 		

 #define BLTLRi(CR)			BCLRii(12, ((CR)<<2)+0)	/* [1, Table F-10] */
-#define BLELRi(CR)			BCLRii( 4  ((CR)<<2)+1)
+#define BLELRi(CR)			BCLRii( 4, ((CR)<<2)+1)
 #define BEQLRi(CR)			BCLRii(12, ((CR)<<2)+2)
 #define BGELRi(CR)			BCLRii( 4, ((CR)<<2)+0)
 #define BGTLRi(CR)			BCLRii(12, ((CR)<<2)+1)
@ -405,7 +411,7 @@ typedef unsigned int jit_insn;
 #define BNULRLi(CR)			BCLRLii( 4, ((CR)<<2)+3)
 		
 #define BLTCTRi(CR)			BCCTRii(12, ((CR)<<2)+0)	/* [1, Table F-10] */
-#define BLECTRi(CR)			BCCTRii( 4  ((CR)<<2)+1)
+#define BLECTRi(CR)			BCCTRii( 4, ((CR)<<2)+1)
 #define BEQCTRi(CR)			BCCTRii(12, ((CR)<<2)+2)
 #define BGECTRi(CR)			BCCTRii( 4, ((CR)<<2)+0)
 #define BGTCTRi(CR)			BCCTRii(12, ((CR)<<2)+1)
@ -511,7 +517,7 @@ typedef unsigned int jit_insn;
 #define BNUi(D)				BNUii(0,D)

 #define BLTLii(C,D)			BCLiii(12, ((C)<<2)+0, D)	/* [1, Table F-??] */
-#define BLELii(C,D)			BCLiii( 4  ((C)<<2)+1, D)
+#define BLELii(C,D)			BCLiii( 4, ((C)<<2)+1, D)
 #define BEQLii(C,D)			BCLiii(12, ((C)<<2)+2, D)
 #define BGELii(C,D)			BCLiii( 4, ((C)<<2)+0, D)
 #define BGTLii(C,D)			BCLiii(12, ((C)<<2)+1, D)
@ -586,7 +592,50 @@ typedef unsigned int jit_insn;
 #define _LO(I)          (_jit_UL(I) & _MASK(16))
 #define _HI(I)          (_jit_UL(I) >>     (16))

+#define _A(OP,RD,RA,RB,RC,XO,RCx)    _jit_I((_u6(OP)<<26)|(_u5(RD)<<21)|(_u5(RA)<<16)|( _u5(RB)<<11)|_u5(RC)<<6|(_u5(XO)<<1)|_u1(RCx))

+#define LFDrri(RD,RA,imm)       _D(50,RD,RA,imm)
+#define LFDUrri(RD,RA,imm)      _D(51,RD,RA,imm)
+#define LFDUxrrr(RD,RA,RB)      _X(31,RD,RA,RB,631,0)
+#define LFDxrrr(RD,RA,RB)       _X(31,RD,RA,RB,599,0)
+
+#define LFSrri(RD,RA,imm)       _D(48,RD,RA,imm)
+#define LFSUrri(RD,RA,imm)      _D(49,RD,RA,imm)
+#define LFSUxrrr(RD,RA,RB)      _X(31,RD,RA,RB,567,0)
+#define LFSxrrr(RD,RA,RB)       _X(31,RD,RA,RB,535,0)
+
+#define STFDrri(RS,RA,imm)      _D(54,RS,RA,imm)
+#define STFDUrri(RS,RA,imm)     _D(55,RS,RA,imm)
+#define STFDUxrrr(RS,RA,RB)     _X(31,RS,RA,RB,759,0)
+#define STFDxrrr(RS,RA,RB)      _X(31,RS,RA,RB,727,0)
+
+#define STFSrri(RS,RA,imm)      _D(52,RS,RA,imm)
+#define STFSUrri(RS,RA,imm)     _D(53,RS,RA,imm)
+#define STFSUxrrr(RS,RA,RB)     _X(31,RS,RA,RB,695,0)
+#define STFSxrrr(RS,RA,RB)      _X(31,RS,RA,RB,663,0)
+#define STFIWXrrr(RS,RA,RB)     _X(31,RS,RA,RB,983,0)
+
+#define FADDDrrr(RD,RA,RB)       _A(63,RD,RA,RB,0,21,0)
+#define FADDSrrr(RD,RA,RB)       _A(59,RD,RA,RB,0,21,0)
+#define FSUBDrrr(RD,RA,RB)       _A(63,RD,RA,RB,0,20,0)
+#define FSUBSrrr(RD,RA,RB)       _A(59,RD,RA,RB,0,20,0)
+#define FMULDrrr(RD,RA,RC)       _A(63,RD,RA,0,RC,25,0)
+#define FMULSrrr(RD,RA,RC)       _A(59,RD,RA,0,RC,25,0)
+#define FDIVDrrr(RD,RA,RB)       _A(63,RD,RA,RB,0,18,0)
+#define FDIVSrrr(RD,RA,RB)       _A(59,RD,RA,RB,0,18,0)	/* fdivs: XO 18 as for fdiv; 25 would encode fmuls */
+#define FSQRTDrr(RD,RB)          _A(63,RD,0,RB,0,22,0)
+#define FSQRTSrr(RD,RB)          _A(59,RD,0,RB,0,22,0)
+#define FSELrrrr(RD,RA,RB,RC)    _A(63,RD,RA,RB,RC,23,0)
+#define FCTIWrr(RD,RB)           _X(63,RD,0,RB,14,0)
+#define FCTIWZrr(RD,RB)          _X(63,RD,0,RB,15,0)
+#define FRSPrr(RD,RB)            _X(63,RD,0,RB,12,0)
+#define FABSrr(RD,RB)            _X(63,RD,0,RB,264,0)
+#define FNABSrr(RD,RB)           _X(63,RD,0,RB,136,0)
+#define FNEGrr(RD,RB)            _X(63,RD,0,RB,40,0)
+#define FMOVErr(RD,RB)           _X(63,RD,0,RB,72,0)
+#define FCMPOrrr(CR,RA,RB)       _X(63,_u3((CR)<<2),RA,RB,32,0)
+#define FCMPUrrr(CR,RA,RB)       _X(63,_u3((CR)<<2),RA,RB,0,0)
+#define MTFSFIri(CR,IMM)          _X(63,_u5((CR)<<2),0,_u5((IMM)<<1),134,0)

 /*** References:
 *
@ -594,4 +643,5 @@ typedef unsigned int jit_insn;
 */


+#endif
 #endif /* __ccg_asm_ppc_h */
--- a/lightning/ppc/core.h
+++ b/lightning/ppc/core.h
@ -36,20 +36,24 @@
 #define __lightning_core_h

 struct jit_local_state {
-  int	nextarg_put;   /* Next r3-r8 reg. to be written */
-  int	nextarg_putfp; /* Next r3-r8 reg. to be written */
-  int	nextarg_get;   /* Next r20-r25 reg. to be read */
+   int	nextarg_puti;  /* number of integer args */
+   int	nextarg_putf;  /* number of float args   */
+   int	nextarg_putd;  /* number of double args  */
+   int	nextarg_geti;  /* Next r20-r25 reg. to be read */
+   int	nextarg_getd;  /* The FP args are picked up from FPR1 -> FPR10 */
+   int  nbArgs;        /* Number of arguments for the prolog */
 };

 #define JIT_SP			1
 #define JIT_RET			3
-#define JIT_R0			9
-#define JIT_R1			10
-#define JIT_R2			30  /* using r8 would limit argument passing */
-#define JIT_V0			29
-#define JIT_V1			28
-#define JIT_V2			27
-#define JIT_AUX			26  /* for 32-bit operands & shift counts */
+#define JIT_R_NUM		3
+#define JIT_V_NUM		7
+#define JIT_R(i)		(9+(i))
+#define JIT_V(i)		(31-(i))
+#define JIT_AUX			JIT_V(JIT_V_NUM)  /* for 32-bit operands & shift counts */
+
+#define jit_pfx_start()   (_jit.jitl.trampolines)
+#define jit_pfx_end()     (_jit.jitl.free)

 /* If possible, use the `small' instruction (rd, rs, imm)
 * else load imm into r26 and use the `big' instruction (rd, rs, r26)
@ -58,6 +62,9 @@ struct jit_local_state {
 #define jit_chk_imu(imm, small, big)		(_uiP(16,(imm)) ? (small) : (MOVEIri(JIT_AUX, imm),  (big)) )
 #define jit_chk_imu15(imm, small, big)		(_uiP(15,(imm)) ? (small) : (MOVEIri(JIT_AUX, imm),  (big)) )

+#define jit_big_ims(imm, big)	               (MOVEIri(JIT_AUX, imm),  (big))
+#define jit_big_imu(imm, big)	               (MOVEIri(JIT_AUX, imm),  (big))
+
 /* Helper macros for branches */
 #define jit_s_brai(rs, is, jmp)			(jit_chk_ims (is, CMPWIri(rs, is), CMPWrr(rs, JIT_AUX)),   jmp, _jit.x.pc)
 #define jit_s_brar(s1, s2, jmp)			(		  CMPWrr(s1, s2), 		           jmp, _jit.x.pc)
@ -87,38 +94,48 @@ struct jit_local_state {
 						MULLWrrr(31, 31, JIT_AUX), SUBrrr((rs), (rs), JIT_AUX), \
 						MFLRr(31))

-/* Emit a 2-instruction MOVEI, even if a 1-instruction one is possible
- * (it is a rare case for branches, and a fixed sequence of instructions
- * is easier to patch). */
-#define jit_movei(reg, imm)			(LISri(reg,_HI(imm)), ORIrri((reg),(reg),_LO(imm)))
-
 /* Patch a movei instruction made of a LIS at lis_pc and an ORI at ori_pc. */
-#define jit_patch_movei(lis_pc, ori_pc)					\
-	(*(lis_pc) &= ~_MASK(16), *lis_pc |= _HI(_jit.x.pc),		\
-	 *(ori_pc) &= ~_MASK(16), *ori_pc |= _LO(_jit.x.pc))		\
+#define jit_patch_movei(lis_pc, ori_pc, dest)			\
+	(*(lis_pc) &= ~_MASK(16), *(lis_pc) |= _HI(dest),		\
+	 *(ori_pc) &= ~_MASK(16), *(ori_pc) |= _LO(dest))		\

 /* Patch a branch instruction */
-#define jit_patch_branch(jump_pc)				\
+#define jit_patch_branch(jump_pc,pv)				\
 	(*(jump_pc) &= ~_MASK(16) | 3,				\
-	 *(jump_pc) |= (_jit_UL(_jit.x.pc) - _jit_UL(jump_pc)) & _MASK(16))
+	 *(jump_pc) |= (_jit_UL(pv) - _jit_UL(jump_pc)) & _MASK(16))

+#define jit_patch_ucbranch(jump_pc,pv)                          \
+         (*(jump_pc) &= ~_MASK(26) | 3,                         \
+         (*(jump_pc) |= (_jit_UL((pv)) - _jit_UL(jump_pc)) & _MASK(26)))
+
+#define _jit_b_encoding		(18 << 26)
 #define _jit_blr_encoding	((19 << 26) | (20 << 21) | (00 << 16) | (00 << 11) | (16 << 1))
+#define _jit_is_ucbranch(a)     (((*(a) & (63<<26)) == _jit_b_encoding))

-#define jit_patch(jump_pc) (					\
+#define jit_patch_at(jump_pc, value) (				\
 	((*(jump_pc - 1) & ~1) == _jit_blr_encoding)		\
-	? jit_patch_movei(((jump_pc) - 4), ((jump_pc) - 3))	\
-	: jit_patch_branch((jump_pc) - 1))
+	  ? jit_patch_movei(((jump_pc) - 4), ((jump_pc) - 3), (value))	\
+	  : ( _jit_is_ucbranch((jump_pc) - 1)                   \
+             ? jit_patch_ucbranch((jump_pc) - 1, (value))       \
+             : jit_patch_branch((jump_pc) - 1, (value))))

+#define jit_patch_movi(movi_pc, val)					\
+	jit_patch_movei((movi_pc) - 2, (movi_pc) - 1, (val))
+
+#define	jit_arg_c()			(_jitl.nextarg_geti--)
+#define	jit_arg_i()			(_jitl.nextarg_geti--)
+#define	jit_arg_l()			(_jitl.nextarg_geti--)
+#define	jit_arg_p()			(_jitl.nextarg_geti--)
+#define	jit_arg_s()			(_jitl.nextarg_geti--)
+#define	jit_arg_uc()			(_jitl.nextarg_geti--)
+#define	jit_arg_ui()			(_jitl.nextarg_geti--)
+#define	jit_arg_ul()			(_jitl.nextarg_geti--)
+#define	jit_arg_us()			(_jitl.nextarg_geti--)
+
+/* Check Mach-O-Runtime documentation: Must skip GPR(s) whenever "corresponding" FPR is used */
+#define jit_arg_f()                    (_jitl.nextarg_geti-- ,_jitl.nextarg_getd++)
+#define jit_arg_d()                    (_jitl.nextarg_geti-=2,_jitl.nextarg_getd++)

-#define	jit_arg_c()			(_jitl.nextarg_get--)
-#define	jit_arg_i()			(_jitl.nextarg_get--)
-#define	jit_arg_l()			(_jitl.nextarg_get--)
-#define	jit_arg_p()			(_jitl.nextarg_get--)
-#define	jit_arg_s()			(_jitl.nextarg_get--)
-#define	jit_arg_uc()			(_jitl.nextarg_get--)
-#define	jit_arg_ui()			(_jitl.nextarg_get--)
-#define	jit_arg_ul()			(_jitl.nextarg_get--)
-#define	jit_arg_us()			(_jitl.nextarg_get--)
 #define jit_addi_i(d, rs, is)		jit_chk_ims((is), ADDICrri((d), (rs), (is)), ADDrrr((d), (rs), JIT_AUX))
 #define jit_addr_i(d, s1, s2)				  ADDrrr((d), (s1), (s2))
 #define jit_addci_i(d, rs, is)		jit_chk_ims((is), ADDICrri((d), (rs), (is)), ADDCrrr((d), (rs), JIT_AUX))
@ -159,9 +176,10 @@ struct jit_local_state {
 #define jit_bosubi_ui(label, rs, is)	(jit_chk_ims ((is), SUBICri((rs), (rs), is), SUBCrr((rs), JIT_AUX)),       MCRXRi(0), BEQi((label)), _jit.x.pc)
 #define jit_boaddr_ui(label, s1, s2)	(		  			     ADDCrr((s1), (s1), (s2)), 	   MCRXRi(0), BEQi((label)), _jit.x.pc)
 #define jit_bosubr_ui(label, s1, s2)	(		  			     SUBCrr((s1), (s1), (s2)), 	   MCRXRi(0), BEQi((label)), _jit.x.pc)
-#define jit_calli(label)	    (jit_movei(JIT_AUX, (label)), MTLRr(JIT_AUX), BLRL(), _jit.x.pc)
-#define jit_divi_i(d, rs, is)		jit_chk_ims(1111111, 0, DIVWrrr ((d), (rs), JIT_AUX))
-#define jit_divi_ui(d, rs, is)		jit_chk_imu(1111111, 0, DIVWUrrr((d), (rs), JIT_AUX))
+#define jit_calli(label)	        (jit_movi_p(JIT_AUX, (label)), MTCTRr(JIT_AUX), BCTRL(), _jitl.nextarg_puti = _jitl.nextarg_putf = _jitl.nextarg_putd = 0, _jit.x.pc)
+#define jit_callr(reg)			(MTCTRr(reg), BCTRL())
+#define jit_divi_i(d, rs, is)		jit_big_ims((is), DIVWrrr ((d), (rs), JIT_AUX))
+#define jit_divi_ui(d, rs, is)	jit_big_imu((is), DIVWUrrr((d), (rs), JIT_AUX))
 #define jit_divr_i(d, s1, s2)		DIVWrrr ((d), (s1), (s2))
 #define jit_divr_ui(d, s1, s2)	DIVWUrrr((d), (s1), (s2))
 #define jit_eqi_i(d, rs, is)		(jit_chk_ims((is), SUBIrri(JIT_AUX, (rs), (is)), SUBrrr(JIT_AUX, (rs), JIT_AUX)), SUBFICrri((d), JIT_AUX, 0), ADDErrr((d), (d), JIT_AUX))
@ -176,8 +194,8 @@ struct jit_local_state {
 #define jit_gti_ui(d, rs, is)		jit_ubooli ((d), (rs), (is), _gt)
 #define jit_gtr_i(d, s1, s2)		jit_sboolr ((d), (s1), (s2), _gt)
 #define jit_gtr_ui(d, s1, s2)		jit_uboolr ((d), (s1), (s2), _gt)
-#define jit_hmuli_i(d, rs, is)		jit_chk_ims(1111111, 0, MULHWrrr ((d), (rs), JIT_AUX))
-#define jit_hmuli_ui(d, rs, is)		jit_chk_imu(1111111, 0, MULHWUrrr((d), (rs), JIT_AUX))
+#define jit_hmuli_i(d, rs, is)		jit_big_ims((is), MULHWrrr ((d), (rs), JIT_AUX))
+#define jit_hmuli_ui(d, rs, is)		jit_big_imu((is), MULHWUrrr((d), (rs), JIT_AUX))
 #define jit_hmulr_i(d, s1, s2)				        MULHWrrr ((d), (s1), (s2))
 #define jit_hmulr_ui(d, s1, s2)				        MULHWUrrr((d), (s1), (s2))
 #define jit_jmpi(label)			(B_EXT((label)), _jit.x.pc)
@ -197,16 +215,18 @@ struct jit_local_state {
 #define jit_ler_i(d, s1, s2)		jit_sboolr2((d), (s1), (s2), _gt )
 #define jit_ler_ui(d, s1, s2)		jit_uboolr2((d), (s1), (s2), _gt )
 #define jit_lshi_i(d, rs, is)					     SLWIrri((d), (rs), (is))
-#define jit_lshr_i(d, s1, s2)		(ANDIrri(JIT_AUX, (s2), 31), SLWrrr ((d), (s1), JIT_AUX))
+#define jit_lshr_i(d, s1, s2)		(ANDI_rri(JIT_AUX, (s2), 31), SLWrrr ((d), (s1), JIT_AUX))
 #define jit_lti_i(d, rs, is)		jit_sbooli ((d), (rs), (is), _lt )
 #define jit_lti_ui(d, rs, is)		jit_ubooli ((d), (rs), (is), _lt )
 #define jit_ltr_i(d, s1, s2)		jit_sboolr ((d), (s1), (s2), _lt )
 #define jit_ltr_ui(d, s1, s2)		jit_uboolr ((d), (s1), (s2), _lt )
-#define jit_modi_i(d, rs, is)		_jit_mod(jit_divi_i (31, (rs), JIT_AUX), (is))
-#define jit_modi_ui(d, rs, is)		_jit_mod(jit_divi_ui(31, (rs), JIT_AUX), (irs))
+#define jit_modi_i(d, rs, is)		_jit_mod(jit_divi_i (31, (rs), JIT_AUX), (rs), (is))
+#define jit_modi_ui(d, rs, is)		_jit_mod(jit_divi_ui(31, (rs), JIT_AUX), (rs), (is))
 #define jit_modr_i(d, s1, s2)		(DIVWrrr(JIT_AUX, (s1), (s2)), MULLWrrr(JIT_AUX, JIT_AUX, (s2)), SUBrrr((d), (s1), JIT_AUX))
 #define jit_modr_ui(d, s1, s2)		(DIVWUrrr(JIT_AUX, (s1), (s2)), MULLWrrr(JIT_AUX, JIT_AUX, (s2)), SUBrrr((d), (s1), JIT_AUX))
 #define jit_movi_i(d, is)		MOVEIri((d), (is))
+#define jit_movi_p(d, is)		(LISri((d), _HI((is))),ORIrri((d),(d),_LO((is))),_jit.x.pc)
+
 #define jit_movr_i(d, rs)		MRrr((d), (rs))
 #define jit_muli_i(d, rs, is)		jit_chk_ims  ((is), MULLIrri((d), (rs), (is)), MULLWrrr((d), (rs), JIT_AUX))
 #define jit_muli_ui(d, rs, is)		jit_chk_imu15((is), MULLIrri((d), (rs), (is)), MULLWrrr((d), (rs), JIT_AUX))
@ -218,17 +238,19 @@ struct jit_local_state {
 #define jit_ori_i(d, rs, is)		jit_chk_imu((is), ORIrri((d), (rs), (is)), ORrrr((d), (rs), JIT_AUX))
 #define jit_orr_i(d, s1, s2)				  ORrrr((d), (s1), (s2))
 #define jit_popr_i(rs)			(LWZrm((rs), 0, 1), ADDIrri(1, 1, 4))
-#define jitfp_prepare(numi, numf, numd)	(_jitl.nextarg_put = 3 + (numi) + (numf) + 2*(numd))
+#define jit_prepare_i(numi)		(_jitl.nextarg_puti = numi)
+#define jit_prepare_f(numf)		(_jitl.nextarg_putf = numf)
+#define jit_prepare_d(numd)		(_jitl.nextarg_putd = numd)
 #define jit_prolog(n)			_jit_prolog(&_jit, (n))
 #define jit_pushr_i(rs)			STWUrm((rs), -4, 1)
-#define jit_pusharg_i(rs)		(--_jitl.nextarg_put, MRrr(_jitl.nextarg_put, (rs)))
-#define jit_ret()			jit_jmpr(31)
-#define jit_retval(rd)			MRrr((rd), 3)
+#define jit_pusharg_i(rs)		(--_jitl.nextarg_puti, MRrr((3 + _jitl.nextarg_putd * 2 + _jitl.nextarg_putf + _jitl.nextarg_puti), (rs)))
+#define jit_ret()			_jit_epilog(&_jit)
+#define jit_retval_i(rd)		MRrr((rd), 3)
 #define jit_rsbi_i(d, rs, is)		jit_chk_ims((is), SUBFICrri((d), (rs), (is)), SUBFCrrr((d), (rs), JIT_AUX))
 #define jit_rshi_i(d, rs, is)					     SRAWIrri((d), (rs), (is))
 #define jit_rshi_ui(d, rs, is)					     SRWIrri ((d), (rs), (is))
-#define jit_rshr_i(d, s1, s2)		(ANDIrrr(JIT_AUX, (s2), 31), SRAWrrr ((d), (s1), JIT_AUX))
-#define jit_rshr_ui(d, s1, s2)		(ANDIrrr(JIT_AUX, (s2), 31), SRWrrr  ((d), (s1), JIT_AUX))
+#define jit_rshr_i(d, s1, s2)		(ANDI_rri(JIT_AUX, (s2), 31), SRAWrrr ((d), (s1), JIT_AUX))
+#define jit_rshr_ui(d, s1, s2)		(ANDI_rri(JIT_AUX, (s2), 31), SRWrrr  ((d), (s1), JIT_AUX))
 #define jit_stxi_c(id, rd, rs)		jit_chk_ims((id), STBrm((rs), (id), (rd)), STBrx((rs), (rd), JIT_AUX))
 #define jit_stxi_i(id, rd, rs)		jit_chk_ims((id), STWrm((rs), (id), (rd)), STWrx((rs), (rd), JIT_AUX))
 #define jit_stxi_s(id, rd, rs)		jit_chk_ims((id), STHrm((rs), (id), (rd)), STHrx((rs), (rd), JIT_AUX))
@ -237,7 +259,7 @@ struct jit_local_state {
 #define jit_stxr_s(d1, d2, rs)				  STHrx((rs), (d1), (d2))
 #define jit_subr_i(d, s1, s2)				  SUBrrr((d), (s1), (s2))
 #define jit_subcr_i(d, s1, s2)				  SUBCrrr((d), (s1), (s2))
-#define jit_subxi_i(d, rs, is)		jit_chk_ims(111111111, 0, SUBErrr((d), (rs), JIT_AUX))
+#define jit_subxi_i(d, rs, is)		jit_big_ims((is), SUBErrr((d), (rs), JIT_AUX))
 #define jit_subxr_i(d, s1, s2)				  SUBErrr((d), (s1), (s2))
 #define jit_xori_i(d, rs, is)		jit_chk_imu((is), XORIrri((d), (rs), (is)), XORrrr((d), (rs), JIT_AUX))
 #define jit_xorr_i(d, s1, s2)				  XORrrr((d), (s1), (s2))
--- a/lightning/ppc/fp.h
+++ b/lightning/ppc/fp.h
@ -35,70 +35,177 @@
 #ifndef __lightning_asm_fp_h
 #define __lightning_asm_fp_h

-#if 0

-/* dummy for now */
+#define JIT_FPR_NUM	       6
+#define JIT_FPR(i)	       (8+(i))

-#define jit_add_two(reg0)	FADDrrr(13 - (reg0), 13 - (reg0), 12 - (reg0))
-#define jit_sub_two(reg0)	FSUBrrr(13 - (reg0), 13 - (reg0), 12 - (reg0))
-#define jit_mul_two(reg0)	FMULrrr(13 - (reg0), 13 - (reg0), 12 - (reg0))
-#define jit_div_two(reg0)	FDIVrrr(13 - (reg0), 13 - (reg0), 12 - (reg0))
-
-#define jit_abs(reg0)		FABSr(13 - (reg0))
-#define jit_sqrt(reg0)		FSQRTr(13 - (reg0))
-#define jit_neg(reg0)		FNEGr(13 - (reg0))
-
-#define jit_ldxi_f(reg0, rs, is) 0
-#define jit_ldxr_f(reg0, s1, s2) 0
-#define jit_ldxi_d(reg0, rs, is) 0
-#define jit_ldxr_d(reg0, s1, s2) 0
-#define jit_ldi_f(reg0, is) 0
-#define jit_ldr_f(reg0, rs) 0
-#define jit_ldi_d(reg0, is) 0
-#define jit_ldr_d(reg0, rs) 0
-#define jit_stxi_f(id, rd, reg0) 0
-#define jit_stxr_f(d1, d2, reg0) 0
-#define jit_stxi_d(id, rd, reg0) 0
-#define jit_stxr_d(d1, d2, reg0) 0
-#define jit_sti_f(id, reg0) 0
-#define jit_str_f(rd, reg0) 0
-#define jit_sti_d(id, reg0) 0
-#define jit_str_d(rd, reg0) 0
+#define JIT_FPFR	       0

 /* Make space for 1 or 2 words, store address in REG */
 #define jit_data(REG, D1)	        (_FBA	(18, 8, 0, 1),  _jit_L(D1), MFLRr(REG))
-#define jit_data2(REG, D1, D2)	(_FBA	(18, 12, 0, 1), _jit_L(D1), _jit_L(D2), MFLRr(REG))

-#define jit_fpimm(reg0, first, second)		\
-	(jit_data2(JIT_AUX, (first), (second)),	\
-	 jit_ldxi_d((reg0), JIT_AUX, 0))
+#define jit_addr_d(rd,s1,s2)  FADDDrrr((rd),(s1),(s2))
+#define jit_subr_d(rd,s1,s2)  FSUBDrrr((rd),(s1),(s2))
+#define jit_mulr_d(rd,s1,s2)  FMULDrrr((rd),(s1),(s2))
+#define jit_divr_d(rd,s1,s2)  FDIVDrrr((rd),(s1),(s2))

-#define jit_floor(rd, reg0)	jit_call_fp((rd), (reg0), floor)
-#define jit_ceil(rd, reg0)	jit_call_fp((rd), (reg0), ceil)
+#define jit_addr_f(rd,s1,s2)  FADDSrrr((rd),(s1),(s2))
+#define jit_subr_f(rd,s1,s2)  FSUBSrrr((rd),(s1),(s2))
+#define jit_mulr_f(rd,s1,s2)  FMULSrrr((rd),(s1),(s2))
+#define jit_divr_f(rd,s1,s2)  FDIVSrrr((rd),(s1),(s2))

-#define jit_call_fp(rd, reg0, fn)						\
-	jit_fail(#fn " not supported", __FILE__, __LINE__, __FUNCTION__)
-/*	pass reg0 as first parameter of rd
-	bl	fn
-	mr	r3, rd */
+#define jit_movr_d(rd,rs)     ( (rd) == (rs) ? 0 : FMOVErr((rd),(rs)))
+#define jit_movi_d(reg0,d) do {                   \
+      double _v = (d);                            \
+      _FBA (18, 12, 0, 1); 			  \
+      memcpy(_jit.x.uc_pc, &_v, sizeof (double)); \
+      _jit.x.uc_pc += sizeof (double);            \
+      MFLRr (JIT_AUX);				  \
+      jit_ldxi_d((reg0), JIT_AUX, 0);		  \
+   } while(0) 

-#define jit_trunc(rd, reg0)	(jit_data((rd), 0), 				\
-				FCTIWZrr(13 - (reg0), 13 - (reg0)),		\
-				STFIWXrrr(13 - (reg0), 0, (rd)),			\
-				LWZrm((rd), 0, (rd)))

-#define jit_round(rd, reg0)	(jit_data((rd), 0),				\
-				FCTIWrr(13 - (reg0), 13 - (reg0)),		\
-				STFIWXrrr(13 - (reg0), 0, (rd)),			\
-				LWZrm((rd), 0, (rd)))
+#define jit_movr_f(rd,rs)     ( (rd) == (rs) ? 0 : FMOVErr((rd),(rs)))
+#define jit_movi_f(reg0,f) do {                   \
+      float _v = (f);                             \
+      _FBA (18, 8, 0, 1); 			  \
+      memcpy(_jit.x.uc_pc, &_v, sizeof (float));  \
+      _jit.x.uc_pc += sizeof (float);             \
+      MFLRr (JIT_AUX);				  \
+      jit_ldxi_f((reg0), JIT_AUX, 0);		  \
+   } while(0) 

-#define jit_cmp(le, ge, reg0)	(FCMPOirr(7, 13 - (reg0), 0),	   	   \
-				CRORiii(28 + _gt, 28 + _gt, 28 + _eq),	   \
-				CRORiii(28 + _lt, 28 + _lt, 28 + _eq),	   \
-				MFCRr((ge)), 				   \
-				EXTRWIrrii((le), (ge), 1, 28 + _lt),	   \
-				EXTRWIrrii((ge), (ge), 1, 28 + _gt))

-#endif
+#define jit_abs_d(rd,rs)       FABSrr((rd),(rs))
+#define jit_negr_d(rd,rs)      FNEGrr((rd),(rs))
+#define jit_sqrt_d(rd,rs)      FSQRTDrr((rd),(rs))
+
+
+#define jit_ldxi_f(reg0, rs, is)    (_siP(16,(is)) ? LFSrri((reg0),(rs),(is)) : (MOVEIri(JIT_AUX,(is)),LFSxrrr((reg0),(rs),JIT_AUX))) 
+#define jit_ldxi_d(reg0, rs, is)    (_siP(16,(is)) ? LFDrri((reg0),(rs),(is)) : (MOVEIri(JIT_AUX,(is)),LFDxrrr((reg0),(rs),JIT_AUX)))
+#define jit_ldxr_f(reg0, s1, s2)    LFSxrrr((reg0),(s1),(s2))
+#define jit_ldxr_d(reg0, s1, s2)    LFDxrrr((reg0),(s1),(s2))
+#define jit_ldi_f(reg0, is)          (_siP(16,(is)) ? LFSrri((reg0),0,(is)) : (MOVEIri(JIT_AUX,(is)),LFSrri((reg0),JIT_AUX,0)))
+#define jit_ldi_d(reg0, is)          (_siP(16,(is)) ? LFDrri((reg0),0,(is)) : (MOVEIri(JIT_AUX,(is)),LFDrri((reg0),JIT_AUX,0)))
+#define jit_ldr_f(reg0, rs)          LFSrri((reg0),(rs),0)
+#define jit_ldr_d(reg0, rs)          LFDrri((reg0),(rs),0)
+#define jit_stxi_f(id, rd, reg0)     (_siP(16,(id)) ? STFSrri((reg0),(rd),(id)) : (MOVEIri(JIT_AUX,(id)),STFSxrrr((reg0),(rd),JIT_AUX)))
+#define jit_stxi_d(id, rd, reg0)     (_siP(16,(id)) ? STFDrri((reg0),(rd),(id)) : (MOVEIri(JIT_AUX,(id)),STFDxrrr((reg0),(rd),JIT_AUX)))
+#define jit_stxr_f(d1, d2, reg0)     STFSxrrr((reg0),(d1),(d2))
+#define jit_stxr_d(d1, d2, reg0)     STFDxrrr((reg0),(d1),(d2))
+#define jit_sti_f(id, reg0)          (_siP(16,(id)) ? STFSrri((reg0),0,(id)) : (MOVEIri(JIT_AUX,(id)),STFSrri((reg0),JIT_AUX,0)))
+#define jit_sti_d(id, reg0)          (_siP(16,(id)) ? STFDrri((reg0),0,(id)) : (MOVEIri(JIT_AUX,(id)),STFDrri((reg0),JIT_AUX,0)))
+#define jit_str_f(rd, reg0)          STFSrri((reg0),(rd),0)
+#define jit_str_d(rd, reg0)          STFDrri((reg0),(rd),0)
+
+#define jit_fpboolr(d, s1, s2, rcbit) (		\
+	FCMPOrrr(_cr0,(s1),(s2)),		\
+	MFCRr((d)),				\
+	EXTRWIrrii((d), (d), 1, (rcbit)))
+
+#define jit_fpboolr_neg(d, s1, s2,rcbit) (	\
+	FCMPOrrr(_cr0,(s1),(s2)),		\
+	MFCRr((d)),				\
+	EXTRWIrrii((d), (d), 1, (rcbit)),	\
+	XORIrri((d), (d), 1))
+
+#define jit_fpboolur(d, s1, s2, rcbit) (	\
+	FCMPUrrr(_cr0,(s1),(s2)),		\
+	MFCRr((d)),				\
+	EXTRWIrrii((d), (d), 1, (rcbit)))
+
+#define jit_fpboolur_neg(d, s1, s2,rcbit) (	\
+	FCMPUrrr(_cr0,(s1),(s2)),		\
+	MFCRr((d)),				\
+	EXTRWIrrii((d), (d), 1, (rcbit)),	\
+	XORIrri((d), (d), 1))
+
+#define jit_fpboolur_or(d, s1, s2, bit1, bit2) (\
+	FCMPUrrr(_cr0,(s1),(s2)),		\
+	CRORiii((bit1), (bit1), (bit2)),	\
+	MFCRr((d)),				\
+	EXTRWIrrii((d), (d), 1, (bit1)))
+
+#define jit_gtr_d(d, s1, s2)      jit_fpboolr ((d),(s1),(s2),_gt)   
+#define jit_ger_d(d, s1, s2)      jit_fpboolr_neg((d),(s1),(s2),_lt)   
+#define jit_ltr_d(d, s1, s2)      jit_fpboolr ((d),(s1),(s2),_lt)         
+#define jit_ler_d(d, s1, s2)      jit_fpboolr_neg((d),(s1),(s2),_gt)         
+#define jit_eqr_d(d, s1, s2)      jit_fpboolr ((d),(s1),(s2),_eq)         
+#define jit_ner_d(d, s1, s2)      jit_fpboolr_neg((d),(s1),(s2),_eq)
+#define jit_unordr_d(d, s1, s2)   jit_fpboolur ((d),(s1),(s2),_un)
+#define jit_ordr_d(d, s1, s2)     jit_fpboolur_neg((d),(s1),(s2),_un)
+#define jit_unler_d(d, s1, s2)    jit_fpboolur_neg ((d), (s1), (s2), _gt)
+#define jit_unltr_d(d, s1, s2)    jit_fpboolur_or ((d), (s1), (s2), _un, _lt)
+#define jit_unger_d(d, s1, s2)    jit_fpboolur_neg ((d), (s1), (s2), _lt)
+#define jit_ungtr_d(d, s1, s2)    jit_fpboolur_or ((d), (s1), (s2), _un, _gt)
+#define jit_ltgtr_d(d, s1, s2)    jit_fpboolur_or ((d), (s1), (s2), _gt, _lt)
+#define jit_uneqr_d(d, s1, s2)    jit_fpboolur_or ((d), (s1), (s2), _un, _eq)
+
+#define jit_fpbr(d, s1, s2, rcbit) (		\
+	FCMPOrrr(_cr0,(s1),(s2)),		\
+	BTii ((rcbit), (d)))
+
+#define jit_fpbr_neg(d, s1, s2,rcbit) (	\
+	FCMPOrrr(_cr0,(s1),(s2)),		\
+	BFii ((rcbit), (d)))
+
+#define jit_fpbur(d, s1, s2, rcbit) (		\
+	FCMPUrrr(_cr0,(s1),(s2)),		\
+	BTii ((rcbit), (d)))
+
+#define jit_fpbur_neg(d, s1, s2,rcbit) (	\
+	FCMPUrrr(_cr0,(s1),(s2)),		\
+	BFii ((rcbit), (d)))
+
+#define jit_fpbur_or(d, s1, s2, bit1, bit2) (	\
+	FCMPUrrr(_cr0,(s1),(s2)),		\
+	CRORiii((bit1), (bit1), (bit2)),	\
+	BTii ((bit1), (d)))
+
+#define jit_bgtr_d(d, s1, s2)      jit_fpbr ((d),(s1),(s2),_gt)   
+#define jit_bger_d(d, s1, s2)      jit_fpbr_neg((d),(s1),(s2),_lt)   
+#define jit_bltr_d(d, s1, s2)      jit_fpbr ((d),(s1),(s2),_lt)         
+#define jit_bler_d(d, s1, s2)      jit_fpbr_neg((d),(s1),(s2),_gt)         
+#define jit_beqr_d(d, s1, s2)      jit_fpbr ((d),(s1),(s2),_eq)         
+#define jit_bner_d(d, s1, s2)      jit_fpbr_neg((d),(s1),(s2),_eq)
+#define jit_bunordr_d(d, s1, s2)   jit_fpbur ((d),(s1),(s2),_un)
+#define jit_bordr_d(d, s1, s2)     jit_fpbur_neg((d),(s1),(s2),_un)
+#define jit_bunler_d(d, s1, s2)    jit_fpbur_neg ((d), (s1), (s2), _gt)
+#define jit_bunltr_d(d, s1, s2)    jit_fpbur_or ((d), (s1), (s2), _un, _lt)
+#define jit_bunger_d(d, s1, s2)    jit_fpbur_neg ((d), (s1), (s2), _lt)
+#define jit_bungtr_d(d, s1, s2)    jit_fpbur_or ((d), (s1), (s2), _un, _gt)
+#define jit_bltgtr_d(d, s1, s2)    jit_fpbur_or ((d), (s1), (s2), _gt, _lt)
+#define jit_buneqr_d(d, s1, s2)    jit_fpbur_or ((d), (s1), (s2), _un, _eq)
+
+#define jit_getarg_f(rd, ofs)        jit_movr_f((rd),(ofs))
+#define jit_getarg_d(rd, ofs)        jit_movr_d((rd),(ofs))
+#define jit_pusharg_d(rs)	     (_jitl.nextarg_putd--,jit_movr_d((_jitl.nextarg_putf+_jitl.nextarg_putd+1), (rs)))
+#define jit_pusharg_f(rs)	     (_jitl.nextarg_putf--,jit_movr_f((_jitl.nextarg_putf+_jitl.nextarg_putd+1), (rs)))
+#define jit_retval_d(op1)            jit_movr_d(1, (op1))
+#define jit_retval_f(op1)            jit_movr_f(1, (op1))
+
+
+#define jit_floorr_d_i(rd,rs)  (MTFSFIri(7,3), \
+                                  FCTIWrr(31,(rs)),    \
+                                  MOVEIri(JIT_AUX,-4), \
+                                  STFIWXrrr(31,JIT_SP,JIT_AUX),   \
+                                  LWZrm((rd),-4,JIT_SP))
+
+#define jit_ceilr_d_i(rd,rs)   (MTFSFIri(7,2), \
+                                  FCTIWrr(31,(rs)),    \
+                                  MOVEIri(JIT_AUX,-4), \
+                                  STFIWXrrr(31,JIT_SP,JIT_AUX),   \
+                                  LWZrm((rd),-4,JIT_SP))
+
+#define jit_roundr_d_i(rd,rs)  (MTFSFIri(7,0), \
+                                  FCTIWrr(31,(rs)),    \
+                                  MOVEIri(JIT_AUX,-4), \
+                                  STFIWXrrr(31,JIT_SP,JIT_AUX),   \
+                                  LWZrm((rd),-4,JIT_SP))
+
+#define jit_truncr_d_i(rd,rs)  (FCTIWZrr(31,(rs)), \
+                                  MOVEIri(JIT_AUX,-4), \
+                                  STFIWXrrr(31,JIT_SP,JIT_AUX),   \
+                                  LWZrm((rd),-4,JIT_SP))

 #endif /* __lightning_asm_h */
--- a/lightning/ppc/funcs.h
+++ b/lightning/ppc/funcs.h
@ -7,7 +7,7 @@

 /***********************************************************************
 *
- * Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+ * Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
 * Written by Paolo Bonzini.
 *
 * This file is part of GNU lightning.
@ -69,13 +69,13 @@ jit_flush_code(void *start, void *end)
  end -= ((long) end) & (cache_line_size - 1);

  /* Force data cache write-backs */
-  for (ddest = start; ddest <= (char *) end; ddest += cache_line_size) {
+  for (ddest = (char *) start; ddest <= (char *) end; ddest += cache_line_size) {
    __asm__ __volatile__ ("dcbst 0,%0" : : "r"(ddest));
  }
  __asm__ __volatile__ ("sync" : : );

  /* Now invalidate the instruction cache */
-  for (idest = start; idest <= (char *) end; idest += cache_line_size) {
+  for (idest = (char *) start; idest <= (char *) end; idest += cache_line_size) {
    __asm__ __volatile__ ("icbi 0,%0" : : "r"(idest));
  }
  __asm__ __volatile__ ("isync" : : );
@ -85,75 +85,78 @@ jit_flush_code(void *start, void *end)

 #define _jit (*jit)

-/* Emit a trampoline for a function.
- * Upon entrance to the trampoline:
- *   - R0      = return address for the function
- *   - LR      = address where the real code for the function lies
- *   - R3-R8   = parameters
- * After jumping to the address pointed to by R10:
- *   - LR      = address where the epilog lies (the function must return there)
- *   - R25-R20 = parameters (order is reversed, 1st argument is R25)
- */
-static jit_insn *
-_jit_trampoline(jit, n)
-     register jit_state *jit;
-     register int	n;
+static void
+_jit_epilog(jit_state *jit)
 {
-  static jit_insn	trampolines[200];
-  static jit_insn	*p_trampolines[6], *free = trampolines;
-  jit_insn		*trampo;
-  int			i, ofs, frame_size;
+  int n = _jitl.nbArgs;
+  int frame_size, i, ofs;
+  int first_saved_reg = JIT_AUX - n;
+  int num_saved_regs = 32 - first_saved_reg;

-  if (!p_trampolines[n]) {
-    _jit.x.pc = trampo = p_trampolines[n] = free;
-
-    frame_size = 24 + (6 + n) * 4;	/* r26..r31 + args		   */
+  frame_size = 24 + 32 + num_saved_regs * 4;	/* r24..r31 + args		   */
  frame_size += 15;			/* the stack must be quad-word     */
  frame_size &= ~15;			/* aligned			   */

-    STWUrm(1, -frame_size, 1);		/* stwu  r1, -x(r1)		   */
-
-    for (ofs = frame_size - (6 + n) * 4, i = 26 - n; i <= 31; ofs += 4, i++) {
-      STWrm(i, ofs, 1);			/* stw   rI, ofs(r1)		   */
-    }
-    STWrm(0, ofs+4, 1);			/* stw   r0, x(r1)		   */
-    for (i = 0; i < n; i++) {
-      MRrr(25-i, 3+i);			/* save parameters in r25..r20	   */
-    }
-    BLRL();				/* blrl				   */
-    LWZrm(0, ofs+4, 1);			/* lwz   r0, x(r1)  (ret.addr.)    */
+#ifdef _CALL_DARWIN
+  LWZrm(0, frame_size + 8, 1);	/* lwz   r0, x+8(r1)  (ret.addr.)  */
+#else
+  LWZrm(0, frame_size + 4, 1);	/* lwz   r0, x+4(r1)  (ret.addr.)  */
+#endif
  MTLRr(0);				/* mtspr LR, r0			   */

-    for (ofs = frame_size - (6 + n) * 4, i = 26 - n; i <= 31; ofs += 4, i++) {
-      LWZrm(i, ofs, 1);			/* lwz   rI, ofs(r1)		   */
-    }
+  ofs = frame_size - num_saved_regs * 4;
+  LMWrm(first_saved_reg, ofs, 1);	/* lmw   rI, ofs(r1)		   */
  ADDIrri(1, 1, frame_size);		/* addi  r1, r1, x		   */
  BLR();				/* blr				   */
-
-    jit_flush_code(trampo, _jit.x.pc);
-    free = _jit.x.pc;
-  }
-
-  return p_trampolines[n];
 }

+/* Emit a prolog for a function.
+   Upon entrance to the trampoline:
+     - LR      = address where the real code for the function lies
+     - R3-R8   = parameters
+   Upon finishing the trampoline:
+     - R0      = return address for the function
+     - R25-R20 = parameters (order is reversed, 1st argument is R25)
+  
+   The +32 in frame_size computation is to account for the parameter area of
+   a function frame. 
+
+   On PPC the frame must have space to host the arguments of any callee.
+   However, as it currently stands, the argument to jit_trampoline (n) is
+   the number of arguments of the caller we generate. Therefore, the
+   callee can overwrite a part of the stack (the saved register area) when it
+   flushes its own parameters on the stack. The addition of a constant 
+   offset = 32 is enough to hold eight 4 bytes arguments.  This is less
+   than perfect but is a reasonable work around for now. 
+   Better solution must be investigated.  */
 static void
-_jit_prolog(jit, n)
-     register jit_state *jit;
-     register int	n;
+_jit_prolog(jit_state *jit, int n)
 {
-  register jit_insn	*save_pc, *trampo;
+  int frame_size;
+  int ofs, i;
+  int first_saved_reg = JIT_AUX - n;
+  int num_saved_regs = 32 - first_saved_reg;

-  save_pc = _jit.x.pc;
-  trampo = _jit_trampoline(jit, n);
-  _jit.x.pc = save_pc;
+  _jitl.nextarg_geti = JIT_AUX - 1;
+  _jitl.nextarg_getd = 1;
+  _jitl.nbArgs = n;
+
+  frame_size = 24 + 32 + num_saved_regs * 4;	/* r27..r31 + args		   */
+  frame_size += 15;			/* the stack must be quad-word     */
+  frame_size &= ~15;			/* aligned			   */

-  _jitl.nextarg_get = 25;
  MFLRr(0);
-  MOVEIri(10, trampo);
-  MTLRr(10);
-  BLRL();				/* blrl				  */
-  MFLRr(31);				/* mflr  r31			  */
+  STWUrm(1, -frame_size, 1);		/* stwu  r1, -x(r1)		   */
+
+  ofs = frame_size - num_saved_regs * 4;
+  STMWrm(first_saved_reg, ofs, 1);		/* stmw  rI, ofs(r1)		   */
+#ifdef _CALL_DARWIN
+  STWrm(0, frame_size + 8, 1);		/* stw   r0, x+8(r1)		   */
+#else
+  STWrm(0, frame_size + 4, 1);		/* stw   r0, x+4(r1)		   */
+#endif
+  for (i = 0; i < n; i++)
+    MRrr(JIT_AUX-1-i, 3+i);		/* save parameters below r24	   */
 }

 #undef _jit
--- a/lightning/sparc/asm.h
+++ b/lightning/sparc/asm.h
@ -49,6 +49,7 @@

 typedef unsigned int jit_insn;

+#ifndef LIGHTNING_DEBUG
 #define _d30(BD)	((_jit_UL(BD) - _jit_UL(_jit.x.pc))>>2)
 #define _d22(BD)	_ck_d(22, _d30(BD))

@ -82,6 +83,9 @@ typedef unsigned int jit_insn;
 #define _3( RD, OP3, RS1, I, ASI, RS2)	_jit_I((3<<30)|		(_u5(RD)<<25)|(_u6(OP3)<<19)|(_u5(RS1)<<14)|(_u1(I)<<13)|(_u8(ASI)<<5)|_u5 (RS2))
 #define _3i(RD, OP3, RS1, I,	  IMM)	_jit_I((3<<30)|		(_u5(RD)<<25)|(_u6(OP3)<<19)|(_u5(RS1)<<14)|(_u1(I)<<13)|	       _s13(IMM))

+#define _FP1(RD, RS1, OPF, RS2)	_2f((RD), 52, (RS1), (OPF), (RS2))
+#define _FP2(RD, RS1, OPF, RS2)	_2f((RD), 53, (RS1), (OPF), (RS2))
+
 /* basic instructions  [Section B, page 87] */

 #define ADDrrr(RS1, RS2, RD)	_2   ((RD),  0, (RS1), 0, 0, (RS2))
@ -300,4 +304,80 @@ typedef unsigned int jit_insn;
 #define WRii(IMM, RD)		WRrii(0, (IMM), (RD))
 #define WRri(RS2, RD)		WRrri(0, (RS2), (RD))

+#define LDFSRx(RS1, RS2)	_3   (0, 33, (RS1), 0, 0, (RS2))
+#define LDFSRm(RS1, IMM)	_3i  (0, 33, (RS1), 1,    (IMM))
+#define STFSRx(RD1, RD2)	_3   (0, 37, (RD1), 0, 0, (RD2))
+#define STFSRm(RD, IMM)		_3i  (0, 37, (RD),  1,    (IMM))
+
+#define FITODrr(FRS, FRD)		_FP1((FRD),  0, 200, (FRS))
+#define FITOSrr(FRS, FRD)		_FP1((FRD),  0, 196, (FRS))
+#define FDTOIrr(FRS, FRD)		_FP1((FRD),  0, 210, (FRS))
+#define FSTOIrr(FRS, FRD)		_FP1((FRD),  0, 209, (FRS))
+#define FSTODrr(FRS, FRD)		_FP1((FRD),  0, 201, (FRS))
+#define FDTOSrr(FRS, FRD)		_FP1((FRD),  0, 198, (FRS))
+#define FMOVSrr(FRS, FRD)		_FP1((FRD),  0,   1, (FRS))
+#define FNEGSrr(FRS, FRD)		_FP1((FRD),  0,   5, (FRS))
+#define FABSSrr(FRS, FRD)		_FP1((FRD),  0,   9, (FRS))
+#define FMOVDrr(FRS, FRD)		_FP1((FRD),  0,   2, (FRS))
+#define FNEGDrr(FRS, FRD)		_FP1((FRD),  0,   6, (FRS))
+#define FABSDrr(FRS, FRD)		_FP1((FRD),  0,  10, (FRS))
+#define FSQRTDrr(FRS, FRD)		_FP1((FRD),  0,  42, (FRS))
+#define FSQRTSrr(FRS, FRD)		_FP1((FRD),  0,  41, (FRS))
+
+#define FADDSrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  65, (FRS2))
+#define FSUBSrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  69, (FRS2))
+#define FMULSrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  73, (FRS2))
+#define FDIVSrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  77, (FRS2))
+
+#define FADDDrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  66, (FRS2))
+#define FSUBDrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  70, (FRS2))
+#define FMULDrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  74, (FRS2))
+#define FDIVDrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  78, (FRS2))
+
+#define FCMPSrr(FRS1, FRS2)		_FP2(0,      (FRS1),  81, (FRS2))
+#define FCMPDrr(FRS1, FRS2)		_FP2(0,      (FRS1),  82, (FRS2))
+
+#define LDFxr(RS1, RS2, RD)	_3   ((RD), 32, (RS1), 0, 0, (RS2))
+#define LDFmr(RS1, IMM, RD)	_3i  ((RD), 32, (RS1), 1,    (IMM))
+#define LDDFxr(RS1, RS2, RD)	_3   ((RD), 35, (RS1), 0, 0, (RS2))
+#define LDDFmr(RS1, IMM, RD)	_3i  ((RD), 35, (RS1), 1,    (IMM))
+#define STFrx(RS, RD1, RD2)	_3   ((RS), 36, (RD1), 0, 0, (RD2))
+#define STFrm(RS, RD1, IMM)	_3i  ((RS), 36, (RD1), 1,    (IMM))
+#define STDFrx(RS, RD1, RD2)	_3   ((RS), 39, (RD1), 0, 0, (RD2))
+#define STDFrm(RS, RD1, IMM)	_3i  ((RS), 39, (RD1), 1,    (IMM))
+
+#define FBNi(DISP)		_0   (0,  0, 6, (DISP))
+#define FBN_Ai(DISP)		_0   (1,  0, 6, (DISP))
+#define FBNEi(DISP)		_0   (0,  1, 6, (DISP))
+#define FBNE_Ai(DISP)		_0   (1,  1, 6, (DISP))
+#define FBLGi(DISP)		_0   (0,  2, 6, (DISP))
+#define FBLG_Ai(DISP)		_0   (1,  2, 6, (DISP))
+#define FBULi(DISP)		_0   (0,  3, 6, (DISP))
+#define FBUL_Ai(DISP)		_0   (1,  3, 6, (DISP))
+#define FBLi(DISP)		_0   (0,  4, 6, (DISP))
+#define FBL_Ai(DISP)		_0   (1,  4, 6, (DISP))
+#define FBUGi(DISP)		_0   (0,  5, 6, (DISP))
+#define FBUG_Ai(DISP)		_0   (1,  5, 6, (DISP))
+#define FBGi(DISP)		_0   (0,  6, 6, (DISP))
+#define FBG_Ai(DISP)		_0   (1,  6, 6, (DISP))
+#define FBUi(DISP)		_0   (0,  7, 6, (DISP))
+#define FBU_Ai(DISP)		_0   (1,  7, 6, (DISP))
+#define FBAi(DISP)		_0   (0,  8, 6, (DISP))
+#define FBA_Ai(DISP)		_0   (1,  8, 6, (DISP))
+#define FBEi(DISP)		_0   (0,  9, 6, (DISP))
+#define FBE_Ai(DISP)		_0   (1,  9, 6, (DISP))
+#define FBUEi(DISP)		_0   (0, 10, 6, (DISP))
+#define FBUE_Ai(DISP)		_0   (1, 10, 6, (DISP))
+#define FBGEi(DISP)		_0   (0, 11, 6, (DISP))
+#define FBGE_Ai(DISP)		_0   (1, 11, 6, (DISP))
+#define FBUGEi(DISP)		_0   (0, 12, 6, (DISP))
+#define FBUGE_Ai(DISP)		_0   (1, 12, 6, (DISP))
+#define FBLEi(DISP)		_0   (0, 13, 6, (DISP))
+#define FBLE_Ai(DISP)		_0   (1, 13, 6, (DISP))
+#define FBULEi(DISP)		_0   (0, 14, 6, (DISP))
+#define FBULE_Ai(DISP)		_0   (1, 14, 6, (DISP))
+#define FBOi(DISP)		_0   (0, 15, 6, (DISP))
+#define FBO_Ai(DISP)		_0   (1, 15, 6, (DISP))
+
+#endif
 #endif /* __ccg_asm_sparc_h */
--- a/lightning/sparc/core.h
+++ b/lightning/sparc/core.h
@ -33,14 +33,13 @@
 #ifndef __lightning_core_h
 #define __lightning_core_h

-#define JIT_R0			_Rl(0)
-#define JIT_R1			_Rl(1)
-#define JIT_R2			_Rl(2)
-#define JIT_V0			_Rl(3)
-#define JIT_V1			_Rl(4)
-#define JIT_V2			_Rl(5)
+#define JIT_R_NUM		3
+#define JIT_V_NUM		6
+#define JIT_R(i)		((i) ? _Rl((i) - 1) : _Rg(2))
+#define JIT_V(i)		_Rl((i)+2)
+
 #define JIT_BIG			_Rg(1)	/* %g1 used to make 32-bit operands */
-#define JIT_BIG2		_Rg(2)	/* %g2 used to make 32-bit compare operands */
+#define JIT_BIG2		_Ro(7)	/* %o7 used to make 32-bit compare operands */
 #define JIT_SP			_Ro(6)
 #define JIT_RZERO		_Rg(0)
 #define JIT_RET			_Ri(0)
@ -94,10 +93,18 @@ struct jit_local_state {
 #define jit_prepare_y(rs, is)		(SRArir(rs, 31, JIT_BIG), WRri(JIT_BIG, _y), NOP(), NOP(), NOP(), _jit.x.pc -= jit_immsize(is))
 #define jit_clr_y(rs, is)		(			  WRri(0,	_y), NOP(), NOP(), NOP(), _jit.x.pc -= jit_immsize(is))

-#define jit_mod(div, mul, d, s1, s2) (					\
-	div (JIT_BIG2, s1, s2),						\
-	mul (JIT_BIG2, JIT_BIG2, s2),					\
-	jit_subr_i (d, s1, JIT_BIG2))
+#define jit_modr(jit_div, jit_mul, d, s1, s2)   \
+        (jit_div (JIT_BIG, s1, s2),             \
+         jit_mul (JIT_BIG, JIT_BIG, s2),        \
+         jit_subr_i (d, s1, JIT_BIG))
+
+#define jit_modi(jit_divi, jit_muli, jit_divr, jit_mulr, d, rs, is)     \
+        (_siP(13,(is))                                                  \
+         ? (jit_divi (JIT_BIG, rs, is),                                 \
+            jit_muli (JIT_BIG, JIT_BIG, is),                            \
+            jit_subr_i (d, rs, JIT_BIG))                                \
+         : (SETir ((is), JIT_BIG2),                                     \
+            jit_modr (jit_divr, jit_mulr, d, rs, JIT_BIG2)))

 /* How many instruction are needed to put imm in a register.  */
 #define jit_immsize(imm)	(!(imm) ? 0 :			\
@ -107,10 +114,16 @@ struct jit_local_state {
 /* branch instructions return the address of the *delay* instruction -- this
 * is just a helper macro that makes jit_patch more readable.
 */
-#define jit_patch_(jump_pc)						\
+#define jit_patch_(jump_pc,pv)						\
 	(*jump_pc &= ~_MASK(22),					\
-	 *jump_pc |= ((_jit_UL(_jit.x.pc) - _jit_UL(jump_pc)) >> 2) & _MASK(22))
+	 *jump_pc |= ((_jit_UL((pv)) - _jit_UL(jump_pc)) >> 2) & _MASK(22))

+#define jit_patch_set(sethi_pc, or_pc, dest)			\
+	(*(sethi_pc) &= ~_MASK(22), *(sethi_pc) |= _HI(dest),	\
+	 *(or_pc) &= ~_MASK(13), *(or_pc) |= _LO(dest))
+
+#define jit_patch_movi(movi_pc, val)				\
+	jit_patch_set((movi_pc) - 2, (movi_pc) - 1, (val))

 #define	jit_arg_c()			(_jitl.nextarg_get++)
 #define	jit_arg_i()			(_jitl.nextarg_get++)
@ -162,8 +175,10 @@ struct jit_local_state {
 #define jit_boaddr_ui(label, s1, s2)	(		   ADDCCrrr((s1), (s2), (s1)),			         BCSi((label)), NOP(), _jit.x.pc - 1)
 #define jit_bosubr_ui(label, s1, s2)	(		   SUBCCrrr((s1), (s2), (s1)),			         BCSi((label)), NOP(), _jit.x.pc - 1)
 #define jit_calli(label)		(CALLi(label), NOP(), _jit.x.pc - 1)
+#define jit_callr(reg)			(CALLx((reg), 0), NOP())
+
 #define jit_divi_i(d, rs, is)		(jit_prepare_y((rs), 0x12345678), SETir((is), JIT_BIG), SDIVrrr((rs), JIT_BIG, (d)) )
-#define jit_divi_ui(d, rs, is)		(jit_clr_y((rs)),    0x12345678), SETir((is), JIT_BIG), UDIVrrr((rs), JIT_BIG, (d)) )
+#define jit_divi_ui(d, rs, is)		(jit_clr_y((rs),     0x12345678), SETir((is), JIT_BIG), UDIVrrr((rs), JIT_BIG, (d)) )
 #define jit_divr_i(d, s1, s2)		(jit_prepare_y((s1), 0), 				SDIVrrr((s1), (s2), (d)))
 #define jit_divr_ui(d, s1, s2)		(jit_clr_y((s1),     0), 				UDIVrrr((s1), (s2), (d)))
 #define jit_eqi_i(d, rs, is)		jit_chk_imm((is), \
@ -208,11 +223,12 @@ struct jit_local_state {
 #define jit_lti_ui(d, rs, is)		jit_booli ((d), (rs), (is), BLUi(_jit.x.pc + 3) )
 #define jit_ltr_i(d, s1, s2)		jit_boolr ((d), (s1), (s2), BLi(_jit.x.pc + 3)  )
 #define jit_ltr_ui(d, s1, s2)		jit_boolr ((d), (s1), (s2), BLUi(_jit.x.pc + 3) )
-#define jit_modi_i(d, rs, is)		jit_modi(jit_divi_i, jit_muli_i, (d), (rs), (is))
-#define jit_modi_ui(d, rs, is)		jit_modi(jit_divi_i, jit_muli_i, (d), (rs), (is))
+#define jit_modi_i(d, rs, is)           jit_modi(jit_divi_i, jit_muli_i, jit_divr_i, jit_mulr_i, (d), (rs), (is))
+#define jit_modi_ui(d, rs, is)          jit_modi(jit_divi_ui, jit_muli_ui, jit_divr_ui, jit_mulr_ui, (d), (rs), (is))
 #define jit_modr_i(d, s1, s2)           jit_modr(jit_divr_i, jit_mulr_i, (d), (s1), (s2))
-#define jit_modr_ui(d, s1, s2)		jit_modr(jit_divr_i, jit_mulr_i, (d), (s1), (s2))
+#define jit_modr_ui(d, s1, s2)          jit_modr(jit_divr_ui, jit_mulr_ui, (d), (s1), (s2))
 #define jit_movi_i(d, is)		SETir((is), (d))
+#define jit_movi_p(d, is)		(SETir2(_HI((is)), _LO((is)), (d)), _jit.x.pc)
 #define jit_movr_i(d, rs)		MOVrr((rs), (d))
 #define jit_muli_i(d, rs, is)		jit_chk_imm((is), SMULrir((rs), (is), (d)), SMULrrr((rs), JIT_BIG, (d)))
 #define jit_muli_ui(d, rs, is)		jit_chk_imm((is), UMULrir((rs), (is), (d)), UMULrrr((rs), JIT_BIG, (d)))
@ -221,14 +237,14 @@ struct jit_local_state {
 #define jit_nop()			NOP()
 #define jit_ori_i(d, rs, is)		jit_chk_imm((is), ORrir((rs), (is), (d)), ORrrr((rs), JIT_BIG, (d)))
 #define jit_orr_i(d, s1, s2)				  ORrrr((s1), (s2), (d))
-#define jit_patch(delay_pc)		jit_patch_ ( ((delay_pc) - 1) )
+#define jit_patch_at(delay_pc, pv)	jit_patch_ (((delay_pc) - 1) , (pv))
 #define jit_popr_i(rs)			(LDmr(JIT_SP, 0, (rs)), ADDrir(JIT_SP, 8, JIT_SP))
-#define jitfp_prepare(numargs, nf, nd)	(_jitl.nextarg_put = (numargs))
-#define jit_prolog(numargs)		(SAVErir(JIT_SP, -96, JIT_SP), _jitl.nextarg_get = _Ri(0))
+#define jit_prepare_i(num)		(_jitl.nextarg_put += (num))
+#define jit_prolog(numargs)		(SAVErir(JIT_SP, -120, JIT_SP), _jitl.nextarg_get = _Ri(0))
 #define jit_pushr_i(rs)			(STrm((rs), JIT_SP, -8), SUBrir(JIT_SP, 8, JIT_SP))
 #define jit_pusharg_i(rs)		(--_jitl.nextarg_put, MOVrr((rs), _Ro(_jitl.nextarg_put)))
 #define jit_ret()			(RET(), RESTORE())
-#define jit_retval(rd)			MOVrr(_Ro(0), (rd))
+#define jit_retval_i(rd)		MOVrr(_Ro(0), (rd))
 #define jit_rshi_i(d, rs, is)		SRArir((rs), (is), (d))
 #define jit_rshi_ui(d, rs, is)		SRLrir((rs), (is), (d))
 #define jit_rshr_i(d, r1, r2)		SRArrr((r1), (r2), (d))
--- a/lightning/sparc/fp.h
+++ b/lightning/sparc/fp.h
@ -7,7 +7,7 @@

 /***********************************************************************
 *
- * Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
+ * Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
 * Written by Paolo Bonzini.
 *
 * This file is part of GNU lightning.
@ -35,99 +35,61 @@
 #ifndef __lightning_asm_fp_h
 #define __lightning_asm_fp_h

-#if 0
+#define JIT_FPR_NUM	6
+#define JIT_FPR(i)	(30-(i)*2)
+#define JIT_FPTMP	18

-/* dummy for now */
+#define jit_addr_f(rd,s1,s2)	FADDSrrr((s1), (s2), (rd))
+#define jit_subr_f(rd,s1,s2)	FSUBSrrr((s1), (s2), (rd))
+#define jit_mulr_f(rd,s1,s2)	FMULSrrr((s1), (s2), (rd))
+#define jit_divr_f(rd,s1,s2)	FDIVSrrr((s1), (s2), (rd))

-#define _FP1(RD, RS1, OPF, RS2)	_2f((RD), 52, (RS1), (OPF), (RS2))
-#define _FP2(RD, RS1, OPF, RS2)	_2f((RD), 53, (RS1), (OPF), (RS2))
+#define jit_addr_d(rd,s1,s2)	FADDDrrr((s1), (s2), (rd))
+#define jit_subr_d(rd,s1,s2)	FSUBDrrr((s1), (s2), (rd))
+#define jit_mulr_d(rd,s1,s2)	FMULDrrr((s1), (s2), (rd))
+#define jit_divr_d(rd,s1,s2)	FDIVDrrr((s1), (s2), (rd))

-#define FITODrr(FRS, FRD)		_FP1((FRD),  0, 200, (FRS))
-#define FDTOIrr(FRS, FRD)		_FP1((FRD),  0, 210, (FRS))
-#define FSTODrr(FRS, FRD)		_FP1((FRD),  0, 201, (FRS))
-#define FDTOSrr(FRS, FRD)		_FP1((FRD),  0, 198, (FRS))
-#define FMOVSrr(FRS, FRD)		_FP1((FRD),  0,   1, (FRS))
-#define FNEGSrr(FRS, FRD)		_FP1((FRD),  0,   5, (FRS))
-#define FABSSrr(FRS, FRD)		_FP1((FRD),  0,   9, (FRS))
-#define FSQRTDrr(FRS, FRD)		_FP1((FRD),  0,  74, (FRS))
+#define jit_movr_f(rd,rs)	FMOVSrr((rs), (rd))
+#define jit_abs_f(rd,rs)	FABSSrr((rs), (rd))
+#define jit_negr_f(rd,rs)	FNEGSrr((rs), (rd))
+#define jit_sqrt_f(rd,rs)	FSQRTSrr((rs), (rd))
+#define jit_movr_d(rd,rs)	FMOVDrr((rs), (rd))
+#define jit_abs_d(rd,rs)	FABSDrr((rs), (rd))
+#define jit_negr_d(rd,rs)	FNEGDrr((rs), (rd))
+#define jit_sqrt_d(rd,rs)	FSQRTDrr((rs), (rd))
+#define jit_extr_f_d(rd, rs)	FSTODrr((rs), (rd))
+#define jit_extr_d_f(rd, rs)	FDTOSrr((rs), (rd))

-#define FADDDrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  66, (FRS2))
-#define FSUBDrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  70, (FRS2))
-#define FMULDrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  82, (FRS2))
-#define FDIVDrrr(FRS1, FRS2, FRD)	_FP1((FRD),  (FRS1),  86, (FRS2))
+#define jit_movi_f(rd,immf)			  \
+    do {					  \
+      float _v = (immf);                          \
+      _1(_jit.x.pc + 3), LDFmr(_Ro(7), 8, (rd));  \
+      memcpy(_jit.x.uc_pc, &_v, sizeof (float));  \
+      _jit.x.uc_pc += sizeof (float);             \
+    } while(0)

-#define FCMPDrr(FRS1, FRS2)		_FP2(0,      (FRS1),  82, (FRS2))
-
-#define LDFxr(RS1, RS2, RD)	_3   ((RD), 32, (RS1), 0, 0, (RS2))
-#define LDFmr(RS1, IMM, RD)	_3i  ((RD), 32, (RS1), 1,    (IMM))
-#define LDDFxr(RS1, RS2, RD)	_3   ((RD), 35, (RS1), 0, 0, (RS2))
-#define LDDFmr(RS1, IMM, RD)	_3i  ((RD), 35, (RS1), 1,    (IMM))
-#define STFrx(RS, RD1, RD2)	_3   ((RS), 36, (RD1), 0, 0, (RD2))
-#define STFrm(RS, RD1, IMM)	_3i  ((RS), 36, (RD1), 1,    (IMM))
-#define STDFrx(RS, RD1, RD2)	_3   ((RS), 39, (RD1), 0, 0, (RD2))
-#define STDFrm(RS, RD1, IMM)	_3i  ((RS), 39, (RD1), 1,    (IMM))
-
-#define FBNi(DISP)		_0   (0,  0, 6, (DISP))
-#define FBN_Ai(DISP)		_0   (1,  0, 6, (DISP))
-#define FBNEi(DISP)		_0   (0,  1, 6, (DISP))
-#define FBNE_Ai(DISP)		_0   (1,  1, 6, (DISP))
-#define FBLGi(DISP)		_0   (0,  2, 6, (DISP))
-#define FBLG_Ai(DISP)		_0   (1,  2, 6, (DISP))
-#define FBULi(DISP)		_0   (0,  3, 6, (DISP))
-#define FBUL_Ai(DISP)		_0   (1,  3, 6, (DISP))
-#define FBLi(DISP)		_0   (0,  4, 6, (DISP))
-#define FBL_Ai(DISP)		_0   (1,  4, 6, (DISP))
-#define FBUGi(DISP)		_0   (0,  5, 6, (DISP))
-#define FBUG_Ai(DISP)		_0   (1,  5, 6, (DISP))
-#define FBGi(DISP)		_0   (0,  6, 6, (DISP))
-#define FBG_Ai(DISP)		_0   (1,  6, 6, (DISP))
-#define FBUi(DISP)		_0   (0,  7, 6, (DISP))
-#define FBU_Ai(DISP)		_0   (1,  7, 6, (DISP))
-#define FBAi(DISP)		_0   (0,  8, 6, (DISP))
-#define FBA_Ai(DISP)		_0   (1,  8, 6, (DISP))
-#define FBEi(DISP)		_0   (0,  9, 6, (DISP))
-#define FBE_Ai(DISP)		_0   (1,  9, 6, (DISP))
-#define FBUEi(DISP)		_0   (0, 10, 6, (DISP))
-#define FBUE_Ai(DISP)		_0   (1, 10, 6, (DISP))
-#define FBGEi(DISP)		_0   (0, 11, 6, (DISP))
-#define FBGE_Ai(DISP)		_0   (1, 11, 6, (DISP))
-#define FBUGEi(DISP)		_0   (0, 12, 6, (DISP))
-#define FBUGE_Ai(DISP)		_0   (1, 12, 6, (DISP))
-#define FBLEi(DISP)		_0   (0, 13, 6, (DISP))
-#define FBLE_Ai(DISP)		_0   (1, 13, 6, (DISP))
-#define FBULEi(DISP)		_0   (0, 14, 6, (DISP))
-#define FBULE_Ai(DISP)		_0   (1, 14, 6, (DISP))
-#define FBOi(DISP)		_0   (0, 15, 6, (DISP))
-#define FBO_Ai(DISP)		_0   (1, 15, 6, (DISP))
-
-#define FSKIPUG()		_0d  (1, 13, 6, 2)	/* fble,a .+8 */
-#define FSKIPUL()		_0d  (1, 11, 6, 2)	/* fbge,a .+8 */
-
-#define jit_add_two(reg0)	FADDDrrr(30 - (reg0) * 2, 28 - (reg0) * 2, 30 - (reg0) * 2)
-#define jit_sub_two(reg0)	FSUBDrrr(30 - (reg0) * 2, 28 - (reg0) * 2, 30 - (reg0) * 2)
-#define jit_mul_two(reg0)	FMULDrrr(30 - (reg0) * 2, 28 - (reg0) * 2, 30 - (reg0) * 2)
-#define jit_div_two(reg0)	FDIVDrrr(30 - (reg0) * 2, 28 - (reg0) * 2, 30 - (reg0) * 2)
-
-#define jit_abs(reg0)		FABSSrr(30 - (reg0) * 2, 30 - (reg0) * 2)
-#define jit_neg(reg0)		FNEGSrr(30 - (reg0) * 2, 30 - (reg0) * 2)
-#define jit_sqrt(reg0)		FSQRTDrr(30 - (reg0) * 2, 30 - (reg0) * 2)
-
-#define jit_fpimm(reg0, first, second)		\
-	(_1(4), NOP(), _jit_L(first), _jit_L(second),	\
-	 jit_ldxi_d((reg0), _Ro(7), 8))
-
-#define jit_ldxi_f(reg0, rs, is)	(jit_chk_imm((is), LDFmr((rs), (is), 30 - (reg0) * 2), LDFxr((rs), JIT_BIG, 30 - (reg0) * 2)), FSTODrr(30 - (reg0) * 2, 30 - (reg0) * 2))
-#define jit_ldxi_d(reg0, rs, is)	jit_chk_imm((is), LDDFmr((rs), (is), 30 - (reg0) * 2), LDDFxr((rs), JIT_BIG, 30 - (reg0) * 2))
-#define jit_ldxr_f(reg0, s1, s2)	(LDFxr((s1), (s2), 30 - (reg0) * 2), FSTODrr(30 - (reg0) * 2, 30 - (reg0) * 2))
-#define jit_ldxr_d(reg0, s1, s2)	LDDFxr((s1), (s2), 30 - (reg0) * 2)
-#define jit_stxi_f(id, rd, reg0)	(FDTOSrr(30 - (reg0) * 2, 30 - (reg0) * 2), jit_chk_imm((id), STFrm(30 - (reg0) * 2, (rd), (id)), STFrx(30 - (reg0) * 2, (rd),  JIT_BIG)))
-#define jit_stxi_d(id, rd, reg0)					  jit_chk_imm((id), STDFrm(30 - (reg0) * 2, (rd), (id)), STDFrx(30 - (reg0) * 2, (rd),  JIT_BIG))
-#define jit_stxr_f(d1, d2, reg0)	(FDTOSrr(30 - (reg0) * 2, 30 - (reg0) * 2), STFrx (30 - (reg0) * 2, (d1), (d2)))
-#define jit_stxr_d(d1, d2, reg0)					  STDFrx(30 - (reg0) * 2, (d1), (d2))
+#define jit_movi_d(rd,immd)			 	\
+    do {					 	\
+      double _v = (immd);                        	\
+      if ((long)_jit.x.pc & 4) NOP();			\
+      _1(_jit.x.pc + 4);				\
+      LDDFmr(_Ro(7), 8, (rd));				\
+      memcpy(_jit.x.uc_pc, &_v, sizeof (double));	\
+      _jit.x.uc_pc += sizeof (double);           	\
+    } while(0)


-#define jit_do_round(mode, rd, freg)	(			\
-	_1(3),							\
+#define jit_ldxi_f(rd, rs, is)		jit_chk_imm((is), LDFmr((rs), (is), (rd)), LDFxr((rs), JIT_BIG, (rd)))
+#define jit_ldxi_d(rd, rs, is)		jit_chk_imm((is), LDDFmr((rs), (is), (rd)), LDDFxr((rs), JIT_BIG, (rd)))
+#define jit_ldxr_f(rd, s1, s2)		LDFxr((s1), (s2), (rd))
+#define jit_ldxr_d(rd, s1, s2)		LDDFxr((s1), (s2), (rd))
+#define jit_stxi_f(id, rd, rs)		jit_chk_imm((id), STFrm((rs), (rd), (id)), STFrx((rs), (rd),  JIT_BIG))
+#define jit_stxi_d(id, rd, rs)		jit_chk_imm((id), STDFrm((rs), (rd), (id)), STDFrx((rs), (rd),  JIT_BIG))
+#define jit_stxr_f(d1, d2, rs)		STFrx((rs), (d1), (d2))
+#define jit_stxr_d(d1, d2, rs)		STDFrx((rs), (d1), (d2))
+
+#define jit_do_round(mode, rd, freg, macro)	(		\
+	_1(_jit.x.pc + 3),					\
 	SETHIir(_HI(mode << 29), JIT_BIG),			\
 	NOP(),							\
 	STFSRm(_Ro(7), 8),		/* store fsr */		\
@ -135,29 +97,71 @@
 	XORrrr(rd, JIT_BIG, JIT_BIG),	/* adjust mode */	\
 	STrm(JIT_BIG, _Ro(7), 8),				\
 	LDFSRm(_Ro(7), 8),		/* load fsr */		\
-	FDTOIrr(freg, freg), 		/* truncate */		\
+	macro,			 	/* truncate */		\
 	STrm(rd, _Ro(7), 8),		/* load old fsr */	\
 	LDFSRm(_Ro(7), 8),					\
-	STFrm(freg, _Ro(7), 8),		/* store truncated value */ \
+	STFrm(JIT_FPTMP, _Ro(7), 8),	/* store truncated value */ \
 	LDmr(_Ro(7), 8, rd))		/* load it into rd */

+#define jit_do_round_no_fsr(macro1, macro2) (		\
+	_1(_jit.x.pc + 3),				\
+	macro1,						\
+	NOP(),						\
+	macro2)

-/*					 call	delay slot			 data	,--- call lands here */
-#define jit_exti_d(reg0, rs)		(_1(3), NOP(), 				 NOP(), STrm((rs), _Ro(7), 8), LDFmr(_Ro(7), 8, 30 - (reg0) * 2), FITODrr(30 - (reg0) * 2, 30 - (reg0) * 2))
-#define jit_round(rd, reg0)		(_1(3), FDTOIrr(30 - (reg0) * 2, 30 - (reg0) * 2), NOP(), STFrm(30 - (reg0) * 2, _Ro(7), 8), LDmr(_Ro(7), 8, (rd)))
-#define jit_floor(rd, reg0)		jit_do_round(3, (rd), (30 - (reg0) * 2))
-#define jit_ceil(rd, reg0)		jit_do_round(2, (rd), (30 - (reg0) * 2))
-#define jit_trunc(rd, reg0)		jit_do_round(1, (rd), (30 - (reg0) * 2))
+#define jit_extr_i_d(rd, rs)		jit_do_round_no_fsr (NOP(), 		       (STrm((rs), _Ro(7), 8), LDFmr(_Ro(7), 8, (rd)), FITODrr((rd), (rd))))
+#define jit_extr_i_f(rd, rs)		jit_do_round_no_fsr (NOP(), 		       (STrm((rs), _Ro(7), 8), LDFmr(_Ro(7), 8, (rd)), FITOSrr((rd), (rd))))
+#define jit_roundr_d_i(rd, rs)		jit_do_round_no_fsr (FDTOIrr((rs), JIT_FPTMP), (STFrm(JIT_FPTMP, _Ro(7), 8), LDmr(_Ro(7), 8, (rd))))
+#define jit_roundr_f_i(rd, rs)		jit_do_round_no_fsr (FSTOIrr((rs), JIT_FPTMP), (STFrm(JIT_FPTMP, _Ro(7), 8), LDmr(_Ro(7), 8, (rd))))
+#define jit_floorr_d_i(rd, rs)		jit_do_round(3, (rd), (rs), FDTOIrr((rs), JIT_FPTMP))
+#define jit_ceilr_d_i(rd, rs)		jit_do_round(2, (rd), (rs), FDTOIrr((rs), JIT_FPTMP))
+#define jit_truncr_d_i(rd, rs)		jit_do_round(1, (rd), (rs), FDTOIrr((rs), JIT_FPTMP))
+#define jit_floorr_f_i(rd, rs)		jit_do_round(3, (rd), (rs), FSTOIrr((rs), JIT_FPTMP))
+#define jit_ceilr_f_i(rd, rs)		jit_do_round(2, (rd), (rs), FSTOIrr((rs), JIT_FPTMP))
+#define jit_truncr_f_i(rd, rs)		jit_do_round(1, (rd), (rs), FSTOIrr((rs), JIT_FPTMP))

-static double jit_zero = 0.0;
+#define jit_ltr_d(d, s1, s2)            (FCMPDrr ((s1), (s2)), FBLi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ltr_f(d, s1, s2)            (FCMPSrr ((s1), (s2)), FBLi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ler_d(d, s1, s2)            (FCMPDrr ((s1), (s2)), FBLEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ler_f(d, s1, s2)            (FCMPSrr ((s1), (s2)), FBLEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_eqr_d(d, s1, s2)            (FCMPDrr ((s1), (s2)), FBEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_eqr_f(d, s1, s2)            (FCMPSrr ((s1), (s2)), FBEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ner_d(d, s1, s2)            (FCMPDrr ((s1), (s2)), FBNEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ner_f(d, s1, s2)            (FCMPSrr ((s1), (s2)), FBNEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ger_d(d, s1, s2)            (FCMPDrr ((s1), (s2)), FBGEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ger_f(d, s1, s2)            (FCMPSrr ((s1), (s2)), FBGEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_gtr_d(d, s1, s2)            (FCMPDrr ((s1), (s2)), FBGi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_gtr_f(d, s1, s2)            (FCMPSrr ((s1), (s2)), FBGi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_unltr_d(d, s1, s2)          (FCMPDrr ((s1), (s2)), FBULi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_unltr_f(d, s1, s2)          (FCMPSrr ((s1), (s2)), FBULi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_unler_d(d, s1, s2)          (FCMPDrr ((s1), (s2)), FBULEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_unler_f(d, s1, s2)          (FCMPSrr ((s1), (s2)), FBULEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_uneqr_d(d, s1, s2)          (FCMPDrr ((s1), (s2)), FBUEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_uneqr_f(d, s1, s2)          (FCMPSrr ((s1), (s2)), FBUEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ltgtr_d(d, s1, s2)          (FCMPDrr ((s1), (s2)), FBLGi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ltgtr_f(d, s1, s2)          (FCMPSrr ((s1), (s2)), FBLGi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_unger_d(d, s1, s2)          (FCMPDrr ((s1), (s2)), FBUGEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_unger_f(d, s1, s2)          (FCMPSrr ((s1), (s2)), FBUGEi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ungtr_d(d, s1, s2)          (FCMPDrr ((s1), (s2)), FBUGi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ungtr_f(d, s1, s2)          (FCMPSrr ((s1), (s2)), FBUGi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ordr_d(d, s1, s2)           (FCMPDrr ((s1), (s2)), FBOi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_ordr_f(d, s1, s2)           (FCMPSrr ((s1), (s2)), FBOi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_unordr_d(d, s1, s2)         (FCMPDrr ((s1), (s2)), FBUi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))
+#define jit_unordr_f(d, s1, s2)         (FCMPSrr ((s1), (s2)), FBUi(_jit.x.pc + 3), MOVir (1, (d)), MOVir (0, (d)))

-#define jit_cmp(le, ge, reg0)		(SETHIir(_HI(_jit_UL(&jit_zero)), (le)), 			\
-					 LDDFmr((le), _LO(_jit_UL(&jit_zero)), 28 - (reg0) * 2),	\
-					 FCMPDrr(30 - (reg0) * 2, 28 - (reg0) * 2),			\
-					 MOVir(0, (le)), MOVir(0, (ge)),			\
-					 FSKIPUL(), MOVir(1, (ge)),				\
-					 FSKIPUG(), MOVir(1, (le)))
+#define jit_prepare_f(num)              (_jitl.nextarg_put += (num))
+#define jit_prepare_d(num)              (_jitl.nextarg_put += 2 * (num))

-#endif
+#define jit_arg_f()                     (_jitl.nextarg_get++)
+#define jit_arg_d()                     (_jitl.nextarg_get += _jitl.nextarg_get & 1, _jitl.nextarg_get += 2, _jitl.nextarg_get - 2)
+
+#define jit_getarg_f(rd, ofs)           (STrm(ofs, _Ri(6), -24), LDFmr (_Ri(6), -24, (rd)))
+#define jit_getarg_d(rd, ofs)           (STDrm(ofs, _Ri(6), -24), LDDFmr (_Ri(6), -24, (rd)))
+
+#define jit_pusharg_f(rs)               (STFrm((rs), _Ri(6), -24), --_jitl.nextarg_put, LDmr (_Ri(6), -24, _Ro(_jitl.nextarg_put)))
+#define jit_pusharg_d(rs)               (STDFrm((rs), _Ri(6), -24), _jitl.nextarg_put -= 2, LDmr (_Ri(6), -24, _Ro(_jitl.nextarg_put)))
+
+#define jit_retval_f(rs)	        jit_movr_f(0, rs)
+#define jit_retval_d(rs)	        jit_movr_d(0, rs)

 #endif /* __lightning_asm_fp_h */
--- a/opcode/Makefile.am
+++ b/opcode/Makefile.am
@ -1,6 +1,8 @@
 EXTRA_LIBRARIES = libdisass.a
 noinst_LIBRARIES = @LIBDISASS@

+AM_CPPFLAGS = -I$(top_srcdir)
+
 libdisass_a_SOURCES = dis-buf.c i386-dis.c ppc-dis.c ppc-opc.c sparc-dis.c \
 	sparc-opc.c disass.c

--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@ -1,8 +1,7 @@
 AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)

-EXTRA_PROGRAMS = testfp funcfp rpnfp
-noinst_PROGRAMS = fibit incr printf printf2 rpn fib fibdelay add
-noinst_DATA = fibit.ok incr.ok printf.ok printf2.ok rpn.ok fib.ok fibdelay.ok testfp.ok funcfp.ok rpnfp.ok add.ok
+noinst_PROGRAMS = fibit incr printf printf2 rpn fib fibdelay add bp testfp funcfp rpnfp
+noinst_DATA = fibit.ok incr.ok printf.ok printf2.ok rpn.ok fib.ok fibdelay.ok testfp.ok funcfp.ok rpnfp.ok add.ok bp.ok
 EXTRA_DIST = $(noinst_DATA) run-test

 if DISASS
@ -10,7 +9,6 @@ LDADD = $(top_builddir)/opcode/libdisass.a
 endif

 if REGRESSION_TESTING
-TESTS = fib fibit fibdelay incr printf printf2 rpn add \
-	#testfp funcfp rpnfp
+TESTS = fib fibit fibdelay incr printf printf2 rpn add bp testfp funcfp rpnfp
 TESTS_ENVIRONMENT=$(srcdir)/run-test
 endif
--- a/tests/bp.c
+++ b/tests/bp.c
@ -0,0 +1,89 @@
+/******************************** -*- C -*- ****************************
+ *
+ *	Simple example of recursion and forward references
+ *
+ ***********************************************************************/
+
+
+/***********************************************************************
+ *
+ * Copyright 2000, 2004 Free Software Foundation, Inc.
+ * Written by Paolo Bonzini and Laurent Michel.
+ *
+ * This file is part of GNU lightning.
+ *
+ * GNU lightning is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1, or (at your option)
+ * any later version.
+ * 
+ * GNU lightning is distributed in the hope that it will be useful, but 
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+ * License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with GNU lightning; see the file COPYING.LESSER; if not, write to the
+ * Free Software Foundation, 59 Temple Place - Suite 330, Boston,
+ * MA 02111-1307, USA.
+ *
+ ***********************************************************************/
+
+#include <stdio.h>
+#include "lightning.h"
+
+static jit_insn codeBuffer[1024];
+
+typedef int (*pifi)(int);	/* Pointer to Int Function of Int */
+
+int main()
+{
+  pifi      nfibs = (pifi) (jit_set_ip(codeBuffer).iptr);
+  int	    in;				/* offset of the argument */
+  jit_insn  *ref;			/* to patch the forward reference */
+  jit_insn  *mref;                     /* ref of move to backpatch */
+  jit_insn  *tp;                       /* location to patch */
+
+        jit_prolog   (1);
+  in =  jit_arg_ui   ();
+        jit_getarg_ui(JIT_V0, in);              /* V0 = n */
+  mref= jit_movi_p(JIT_V2,jit_forward ());      /* movi with a placeholder address, back-patched below */
+        jit_jmpr(JIT_V2);                       /* jump through V2 to the patched-in address (tp) */
+        /* generate some dumb filler that will never be executed! */
+        jit_addi_ui(JIT_V0,JIT_V0,1);
+        jit_addi_ui(JIT_V0,JIT_V0,1);
+        jit_addi_ui(JIT_V0,JIT_V0,1);
+        jit_addi_ui(JIT_V0,JIT_V0,1);
+  tp  = jit_get_label ();                       /* target that the movi above is patched to */
+  ref = jit_blti_ui  (jit_forward(), JIT_V0, 2); /* if n < 2, branch to the base case */
+        jit_subi_ui  (JIT_V1, JIT_V0, 1);       /* V1 = n-1 */
+        jit_subi_ui  (JIT_V2, JIT_V0, 2);       /* V2 = n-2 */
+        jit_prepare_i(1);                       /* same _i call protocol as tests/fib.c */
+          jit_pusharg_ui(JIT_V1);
+        jit_finish(nfibs);
+        jit_retval_i (JIT_V1);                  /* V1 = nfibs(n-1) */
+        jit_prepare_i(1);
+          jit_pusharg_ui(JIT_V2);
+        jit_finish(nfibs);
+        jit_retval_i (JIT_V2);                  /* V2 = nfibs(n-2) */
+        jit_addi_ui(JIT_V1,  JIT_V1,  1);
+        jit_addr_ui(JIT_RET, JIT_V1, JIT_V2);   /* RET = V1 + V2 + 1 */
+        jit_ret();
+
+  jit_patch(ref);                               /* patch jump */
+        jit_movi_i(JIT_RET, 1);                 /* RET = 1 */
+        jit_ret();
+
+  jit_patch_movi(mref,tp);                      /* Ok. Do the back-patching */
+
+  /* flush the generated code; it is called with 32 as an argument below */
+  jit_flush_code(codeBuffer, jit_get_ip().ptr);
+
+#ifdef LIGHTNING_DISASSEMBLE
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
+#endif
+#ifndef LIGHTNING_CROSS
+  printf("nfibs(%d) = %d\n", 32, nfibs(32));
+#endif
+  return 0;
+}
--- a/tests/bp.ok
+++ b/tests/bp.ok
@ -0,0 +1 @@
+nfibs(32) = 7049155
--- a/tests/fib.c
+++ b/tests/fib.c
@ -48,14 +48,14 @@ int main()
  ref = jit_blti_ui  (jit_forward(), JIT_V0, 2);
        jit_subi_ui  (JIT_V1, JIT_V0, 1);       /* V1 = n-1 */
        jit_subi_ui  (JIT_V2, JIT_V0, 2);       /* V2 = n-2 */
-        jit_prepare  (1);
+        jit_prepare_i(1);
          jit_pusharg_ui(JIT_V1);
        jit_finish(nfibs);
-        jit_retval(JIT_V1);                     /* V1 = nfibs(n-1) */
-        jit_prepare(1);
+        jit_retval_i (JIT_V1);                   /* V1 = nfibs(n-1) */
+        jit_prepare_i(1);
          jit_pusharg_ui(JIT_V2);
        jit_finish(nfibs);
-        jit_retval(JIT_V2);                     /* V2 = nfibs(n-2) */
+        jit_retval_i (JIT_V2);                   /* V2 = nfibs(n-2) */
        jit_addi_ui(JIT_V1,  JIT_V1,  1);
        jit_addr_ui(JIT_RET, JIT_V1, JIT_V2);   /* RET = V1 + V2 + 1 */
        jit_ret();
--- a/tests/fibit.c
+++ b/tests/fibit.c
@ -65,7 +65,7 @@ int main()
  jit_flush_code(codeBuffer, jit_get_ip().ptr);

 #ifdef LIGHTNING_DISASSEMBLE
-  disassemble(stderr, codeBuffer, jit_get_ip().ptr);
+  disassemble(stderr, (char *) codeBuffer, jit_get_ip().ptr);
 #endif
 #ifndef LIGHTNING_CROSS
  /* call the generated code, passing 36 as an argument */
--- a/tests/funcfp.c
+++ b/tests/funcfp.c
@ -7,7 +7,7 @@

 /***********************************************************************
 *
- * Copyright 2000 Free Software Foundation, Inc.
+ * Copyright 2000, 2004 Free Software Foundation, Inc.
 * Written by Paolo Bonzini.
 *
 * This file is part of GNU lightning.
@ -34,139 +34,139 @@
 #include "lightning.h"

 static jit_insn codeBuffer[300];
-static struct jit_fp buffer[300];
+
+typedef int (*intFunc) (int, int);
+typedef double (*dblFunc) (double, double);
+typedef float (*floatFunc) (float, float);


-typedef int (*intFunc)(int,int);
-typedef double (*dblFunc)(double,double);
-typedef float (*floatFunc)(float,float);
-
-
-dblFunc makeDblFunc()
-     /* Generate a function that computes and returns the sum of 
+/* Generate a function that computes and returns the sum of 
   its two double arguments (return an int)
-        i.e., double foo(double x,double y) { return x + y;}
-     */
+   i.e., double foo(double x,double y) { return x + y;} */
+dblFunc
+makeDblFunc ()
 {
  dblFunc retVal;
-   int dbl1,dbl2;
-   jit_set_ip(codeBuffer);
-   retVal = (dblFunc)jit_get_ip().iptr; 
-   jit_prolog(2);
-   jitfp_begin(buffer);   
-   dbl1 = jit_arg_d();
-   dbl2 = jit_arg_d();
-   
-   
-   jitfp_retval(jitfp_add(jitfp_getarg_d(dbl1),
-                          jitfp_getarg_d(dbl2)));
-   
-   jit_ret();
-   jit_flush_code((char*)retVal,jit_get_ip().ptr);  
+  int dbl1, dbl2;
+  retVal = (dblFunc) jit_get_ip ().iptr;
+  jit_prolog (2);
+  dbl1 = jit_arg_d ();
+  dbl2 = jit_arg_d ();
+  jit_getarg_d (JIT_FPR0, dbl1);
+  jit_getarg_d (JIT_FPR1, dbl2);
+  jit_addr_d (JIT_FPR0, JIT_FPR0, JIT_FPR1);
+  jit_retval_d (JIT_FPR0);
+  jit_ret ();
+  jit_flush_code ((char *) retVal, jit_get_ip ().ptr);

 #ifdef LIGHTNING_DISASSEMBLE
-   disassemble(stderr, retVal, jit_get_ip().ptr);
+  disassemble (stderr, (char *) retVal, jit_get_ip ().ptr);
 #endif

  return retVal;
 }


-floatFunc makeFloatFunc()
-     /* Generate a function that computes and returns the sum of 
+/* Generate a function that computes and returns the sum of 
   its two double arguments (return an int)
-        i.e., double foo(double x,double y) { return x + y;}
-     */
+   i.e., double foo(double x,double y) { return x + y;} */
+floatFunc
+makeFloatFunc ()
 {
  floatFunc retVal;
-   int dbl1,dbl2;
-   //jit_set_ip(codeBuffer);
-   retVal = (floatFunc)jit_get_ip().iptr; 
-   jit_prolog(2);
-   jitfp_begin(buffer);   
-   dbl1 = jit_arg_f();
-   dbl2 = jit_arg_f();
-   
-   
-   jitfp_retval(jitfp_add(jitfp_getarg_f(dbl1),
-                          jitfp_getarg_f(dbl2)));
-   
-   jit_ret();
-   jit_flush_code((char*)retVal,jit_get_ip().ptr);  
+  int dbl1, dbl2;
+  retVal = (floatFunc) jit_get_ip ().iptr;
+  jit_prolog (2);
+  dbl1 = jit_arg_f ();
+  dbl2 = jit_arg_f ();
+  jit_getarg_f (JIT_FPR0, dbl1);
+  jit_getarg_f (JIT_FPR1, dbl2);
+  jit_addr_f (JIT_FPR0, JIT_FPR0, JIT_FPR1);
+  jit_retval_f (JIT_FPR0);
+  jit_ret ();
+  jit_flush_code ((char *) retVal, jit_get_ip ().ptr);

 #ifdef LIGHTNING_DISASSEMBLE
-   disassemble(stderr, retVal, jit_get_ip().ptr);
+  disassemble (stderr, (char *) retVal, jit_get_ip ().ptr);
 #endif

  return retVal;
 }

-dblFunc makeCallFunc(dblFunc theFunc) 
+dblFunc
+makeCallFunc (dblFunc theFunc)
 {
  dblFunc retVal;
-   int dbl1,dbl2;
-   //jit_set_ip(codeBuffer);
-   retVal = (dblFunc)jit_get_ip().iptr; 
-   jit_prolog(2);
-   jitfp_begin(buffer);   
-   dbl1 = jit_arg_d();
-   dbl2 = jit_arg_d();
+  int dbl1, dbl2;
+  retVal = (dblFunc) jit_get_ip ().iptr;
+  jit_prolog (2);
+  dbl1 = jit_arg_d ();
+  dbl2 = jit_arg_d ();

-   jitfp_prepare(0,0,2);
-   jitfp_pusharg_d(jitfp_mul(jitfp_getarg_d(dbl1),
-                             jitfp_getarg_d(dbl2)));
-   jitfp_pusharg_d(jitfp_getarg_d(dbl1));
-   jit_finish((void*)theFunc);
-   jit_ret();
-   jit_flush_code((char*)retVal,jit_get_ip().ptr);  
+  jit_prepare_d (2);
+  jit_getarg_d (JIT_FPR0, dbl1);
+  jit_getarg_d (JIT_FPR1, dbl2);
+  jit_mulr_d (JIT_FPR1, JIT_FPR1, JIT_FPR0);
+  jit_pusharg_d (JIT_FPR1);
+  jit_pusharg_d (JIT_FPR0);
+  jit_finish ((void *) theFunc);
+  jit_ret ();
+  jit_flush_code ((char *) retVal, jit_get_ip ().ptr);

 #ifdef LIGHTNING_DISASSEMBLE
-   disassemble(stderr, retVal, jit_get_ip().ptr);
+  disassemble (stderr, (char *) retVal, jit_get_ip ().ptr);
 #endif

  return retVal;
 }

-floatFunc makeCallFloatFunc(floatFunc theFunc) 
+floatFunc
+makeCallFloatFunc (floatFunc theFunc)
 {
  floatFunc retVal;
-   int dbl1,dbl2;
-   //jit_set_ip(codeBuffer);
-   retVal = (floatFunc)jit_get_ip().iptr; 
-   jit_prolog(2);
-   jitfp_begin(buffer);   
-   dbl1 = jit_arg_f();
-   dbl2 = jit_arg_f();
+  int dbl1, dbl2;
+  retVal = (floatFunc) jit_get_ip ().iptr;
+  jit_prolog (2);
+  dbl1 = jit_arg_f ();
+  dbl2 = jit_arg_f ();

-   jitfp_prepare(0,2,0);
-   jitfp_pusharg_f(jitfp_mul(jitfp_getarg_f(dbl1),
-                             jitfp_getarg_f(dbl2)));
-   jitfp_pusharg_f(jitfp_getarg_f(dbl1));
-   jit_finish((void*)theFunc);
-   jit_ret();
-   jit_flush_code((char*)retVal,jit_get_ip().ptr);  
+  jit_prepare_f (2);
+  jit_getarg_f (JIT_FPR0, dbl1);
+  jit_getarg_f (JIT_FPR1, dbl2);
+  jit_mulr_f (JIT_FPR1, JIT_FPR1, JIT_FPR0);
+  jit_pusharg_f (JIT_FPR1);
+  jit_pusharg_f (JIT_FPR0);
+  jit_finish ((void *) theFunc);
+  jit_ret ();
+  jit_flush_code ((char *) retVal, jit_get_ip ().ptr);

 #ifdef LIGHTNING_DISASSEMBLE
-   disassemble(stderr, retVal, jit_get_ip().ptr);
+  disassemble (stderr, (char *) retVal, jit_get_ip ().ptr);
 #endif

  return retVal;
 }


-int main(int argc,char* argv[])
+int
+main (int argc, char *argv[])
 {
-   dblFunc myFunc2 = makeDblFunc();
-   floatFunc myFunc3 = makeFloatFunc();
-   dblFunc callIt1  = makeCallFunc(myFunc2);
-   floatFunc callIt2  = makeCallFloatFunc(myFunc3);
+  dblFunc myFunc2, callIt1;
+  floatFunc myFunc3, callIt2;
+  double y;
+  float a, b, z;

+  jit_set_ip (codeBuffer);
+  myFunc2 = makeDblFunc ();
+  myFunc3 = makeFloatFunc ();
+  callIt1 = makeCallFunc (myFunc2);
+  callIt2 = makeCallFloatFunc (myFunc3);
 #ifndef LIGHTNING_CROSS
-   double y = callIt1(10.5,15.3);
-   float a = 1.5;
-   float b = 10.5;
-   float z = callIt2(a,b);
-   printf("result is %f\t %f\n",y,z);
+  y = callIt1 (10.5, 15.3);
+  a = 1.5;
+  b = 10.5;
+  z = callIt2 (a, b);
+  printf ("result is %.5g\t %.5g\n", y, z);
 #endif

  return 0;
--- a/tests/funcfp.ok
+++ b/tests/funcfp.ok
@ -1 +1 @@
-result is 171.150000	 17.250000
+result is 171.15	 17.25
--- a/tests/printf.c
+++ b/tests/printf.c
@ -48,7 +48,7 @@ int main()
  ofs = jit_arg_i();
  jit_movi_p(JIT_R0, "looks like %d bytes sufficed\n");
  jit_getarg_i(JIT_R1, ofs);
-  jit_prepare(2);
+  jit_prepare_i(2);
    jit_pusharg_i(JIT_R1);		/* push in reverse order */
    jit_pusharg_p(JIT_R0);
  jit_finish(printf);
--- a/tests/rpnfp.c
+++ b/tests/rpnfp.c
@ -38,19 +38,18 @@ static jit_insn codeBuffer[1024];

 typedef double (*pdfd) (double);	/* Pointer to Double Function of Double */

+static int regs[6] = { JIT_FPR0, JIT_FPR1, JIT_FPR2, JIT_FPR3, JIT_FPR4, JIT_FPR5 };

 pdfd
 compile_rpn (char *expr)
 {
  pdfd fn;
  int ofs, sp = 1;
-  struct jit_fp buffer[300], *stack[10];

-  jitfp_begin (buffer);
  fn = (pdfd) (jit_get_ip ().dptr);
  jit_leaf (1);
  ofs = jit_arg_d ();
-  stack[0] = jitfp_getarg_d (ofs);
+  jit_getarg_d (regs[0], ofs);

  while (*expr)
    {
@ -62,26 +61,27 @@ compile_rpn (char *expr)
 	{
 	  double d = strtod (buf, NULL);
 	  expr += n - 1;
-	  stack[sp++] = jitfp_imm (d);
+	  jit_movi_d (regs[sp], d);
+	  sp++;
 	}
      else if (*expr == '+')
 	{
-	  stack[sp - 2] = jitfp_add (stack[sp - 2], stack[sp - 1]);
+	  jit_addr_d (regs[sp - 2], regs[sp - 2], regs[sp - 1]);
 	  sp--;
 	}
      else if (*expr == '-')
 	{
-	  stack[sp - 2] = jitfp_sub (stack[sp - 2], stack[sp - 1]);
+	  jit_subr_d (regs[sp - 2], regs[sp - 2], regs[sp - 1]);
 	  sp--;
 	}
      else if (*expr == '*')
 	{
-	  stack[sp - 2] = jitfp_mul (stack[sp - 2], stack[sp - 1]);
+	  jit_mulr_d (regs[sp - 2], regs[sp - 2], regs[sp - 1]);
 	  sp--;
 	}
      else if (*expr == '/')
 	{
-	  stack[sp - 2] = jitfp_div (stack[sp - 2], stack[sp - 1]);
+	  jit_divr_d (regs[sp - 2], regs[sp - 2], regs[sp - 1]);
 	  sp--;
 	}
      else
@ -91,7 +91,7 @@ compile_rpn (char *expr)
 	}
      ++expr;
    }
-  jitfp_retval (stack[0]);
+  jit_retval_d (regs[0]);
  jit_ret ();

  jit_flush_code ((char *) fn, jit_get_ip ().ptr);
--- a/tests/testfp.c
+++ b/tests/testfp.c
@ -7,7 +7,7 @@

 /***********************************************************************
 *
- * Copyright 2000, 2002 Free Software Foundation, Inc.
+ * Copyright 2000, 2002, 2004 Free Software Foundation, Inc.
 * Written by Paolo Bonzini.
 *
 * This file is part of GNU lightning.
@ -34,104 +34,115 @@
 #include "lightning.h"

 static jit_insn codeBuffer[300];
-static struct jit_fp buffer[300];
 static double a;

 void
-int_test(what, code)
-     char     *what;
-     jit_code code;
+int_test(char *what, jit_code code, double b, double c, double d, double e, double f)
 {
-  a = -2.6; printf("%s\t\t%d ", what, code.iptr());
-  a = -2.4; printf("%d ", code.iptr());
-  a = 0.0; printf("%d ", code.iptr());
-  a = 2.4; printf("%d ", code.iptr());
-  a = 2.6; printf("%d\n", code.iptr());
+  a = b; printf("%s\t\t%d ", what, code.iptr());
+  a = c; printf("%d ", code.iptr());
+  a = d; printf("%d ", code.iptr());
+  a = e; printf("%d ", code.iptr());
+  a = f; printf("%d\n", code.iptr());
 }

 int
 main()
 {
  jit_code code;
+  volatile double x = 0.0;
  code.ptr = (char *) codeBuffer;

  jit_set_ip(codeBuffer);
  jit_leaf(0);
-  jitfp_begin(buffer);
-  jitfp_cmp(JIT_R1, JIT_R0,
-    jitfp_ldi_d(&a)
-  );
+  jit_ldi_d(JIT_FPR0, &a);
+  jit_movi_d(JIT_FPR1, 0.0);
+  jit_gtr_d(JIT_R0, JIT_FPR0, JIT_FPR1);
+  jit_ltr_d(JIT_R1, JIT_FPR0, JIT_FPR1);
  jit_subr_i(JIT_RET, JIT_R0, JIT_R1);	/* [greater] - [less] = -1/0/1 */
  jit_ret();

  jit_flush_code(codeBuffer, jit_get_ip().ptr);
 #ifdef LIGHTNING_DISASSEMBLE
-  disassemble(stderr, codeBuffer, jit_get_ip().ptr);
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
 #endif
 #ifndef LIGHTNING_CROSS
-  int_test("compare", code);
+  int_test("compare", code, -2.6, -2.4, 0, 2.4, 2.6);
+#endif
+
+#ifdef __GNUC__
+  jit_set_ip(codeBuffer);
+  jit_leaf(0);
+  jit_ldi_d(JIT_FPR0, &a);
+  jit_movi_d(JIT_FPR1, 0.0);
+  jit_eqr_d(JIT_R0, JIT_FPR0, JIT_FPR1);
+  jit_ltgtr_d(JIT_R1, JIT_FPR0, JIT_FPR1);
+  jit_lshi_i(JIT_R1, JIT_R1, 1);
+  jit_orr_i(JIT_RET, JIT_R0, JIT_R1);
+  jit_ret();
+
+  jit_flush_code(codeBuffer, jit_get_ip().ptr);
+#ifdef LIGHTNING_DISASSEMBLE
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
+#endif
+#ifndef LIGHTNING_CROSS
+  int_test("nans", code, x / x, 1 / (a - a), -1 / (a - a), 0.0, -2.0);
+#endif
+#else
+  printf ("nans\t\t0 2 2 1 2\n");
 #endif

  jit_set_ip(codeBuffer);
  jit_leaf(0);
-  jitfp_begin(buffer);
-  jitfp_trunc(JIT_RET,
-    jitfp_ldi_d(&a)
-  );
+  jit_ldi_d(JIT_FPR0, &a);
+  jit_truncr_d_i(JIT_RET, JIT_FPR0);
  jit_ret();
 #ifdef LIGHTNING_DISASSEMBLE
-  disassemble(stderr, codeBuffer, jit_get_ip().ptr);
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
 #endif
 #ifndef LIGHTNING_CROSS
-  int_test("trunc", code);
+  int_test("trunc", code, -2.6, -2.4, 0, 2.4, 2.6);
 #endif

  jit_set_ip(codeBuffer);
  jit_leaf(0);
-  jitfp_begin(buffer);
-  jitfp_ceil(JIT_RET,
-    jitfp_ldi_d(&a)
-  );
+  jit_ldi_d(JIT_FPR0, &a);
+  jit_ceilr_d_i(JIT_RET, JIT_FPR0);
  jit_ret();
 #ifdef LIGHTNING_DISASSEMBLE
-  disassemble(stderr, codeBuffer, jit_get_ip().ptr);
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
 #endif
 #ifndef LIGHTNING_CROSS
-  int_test("ceil", code);
+  int_test("ceil", code, -2.6, -2.4, 0, 2.4, 2.6);
 #endif

  jit_set_ip(codeBuffer);
  jit_leaf(0);
-  jitfp_begin(buffer);
-  jitfp_floor(JIT_RET,
-    jitfp_ldi_d(&a)
-  );
+  jit_ldi_d(JIT_FPR0, &a);
+  jit_floorr_d_i(JIT_RET, JIT_FPR0);
  jit_ret();
 #ifdef LIGHTNING_DISASSEMBLE
-  disassemble(stderr, codeBuffer, jit_get_ip().ptr);
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
 #endif
 #ifndef LIGHTNING_CROSS
-  int_test("floor", code);
+  int_test("floor", code, -2.6, -2.4, 0, 2.4, 2.6);
 #endif

  jit_set_ip(codeBuffer);
  jit_leaf(0);
-  jitfp_begin(buffer);
-  jitfp_round(JIT_RET,
-    jitfp_ldi_d(&a)
-  );
+  jit_ldi_d(JIT_FPR0, &a);
+  jit_roundr_d_i(JIT_RET, JIT_FPR0);
  jit_ret();
 #ifdef LIGHTNING_DISASSEMBLE
-  disassemble(stderr, codeBuffer, jit_get_ip().ptr);
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
 #endif
 #ifndef LIGHTNING_CROSS
-  int_test("round", code);
+  int_test("round", code, -2.6, -2.4, 0, 2.4, 2.6);
 #endif

 #if 0 && defined JIT_TRANSCENDENTAL
  jit_set_ip(codeBuffer);
  jit_leaf(0);
-  jitfp_begin(buffer);
  jitfp_sti_d(&a,
    jitfp_log(
      jitfp_exp(jitfp_imm(1.0))
@ -140,7 +151,7 @@ main()
  jit_ret();
  code.vptr();
 #ifdef LIGHTNING_DISASSEMBLE
-  disassemble(stderr, codeBuffer, jit_get_ip().ptr);
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
 #endif
 #ifndef LIGHTNING_CROSS
  printf("log e = \t%f\n", a);
@ -148,7 +159,6 @@ main()

  jit_set_ip(codeBuffer);
  jit_leaf(0);
-  jitfp_begin(buffer);
  jitfp_sti_d(&a,
    jitfp_atn(
      jitfp_imm(1.732050807657)
@ -157,7 +167,7 @@ main()
  jit_ret();
  code.vptr();
 #ifdef LIGHTNING_DISASSEMBLE
-  disassemble(stderr, codeBuffer, jit_get_ip().ptr);
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
 #endif
 #ifndef LIGHTNING_CROSS
  printf("pi =         \t%f\n", a*3);
@ -165,7 +175,6 @@ main()

  jit_set_ip(codeBuffer);
  jit_leaf(0);
-  jitfp_begin(buffer);
  jitfp_sti_d(&a,
    jitfp_tan(
      jitfp_ldi_d(&a)
@ -174,7 +183,7 @@ main()
  jit_ret();
  code.vptr();
 #ifdef LIGHTNING_DISASSEMBLE
-  disassemble(stderr, codeBuffer, jit_get_ip().ptr);
+  disassemble(stderr, (char *)codeBuffer, jit_get_ip().ptr);
 #endif
 #ifndef LIGHTNING_CROSS
  printf("tan^2 pi/3 = \t%f\n", a*a);
--- a/tests/testfp.ok
+++ b/tests/testfp.ok
@ -1,4 +1,5 @@
-compare		1 1 0 1 1
+compare		-1 -1 0 1 1
+nans		0 2 2 1 2
 trunc		-2 -2 0 2 2
 ceil		-2 -2 0 3 3
 floor		-3 -3 0 2 2