summaryrefslogtreecommitdiffstats
path: root/cpukit/score/cpu/i386
diff options
context:
space:
mode:
authorTill Straumann <strauman@slac.stanford.edu>2009-11-10 06:55:28 +0000
committerTill Straumann <strauman@slac.stanford.edu>2009-11-10 06:55:28 +0000
commitb02f4cc11f03d3425e3ce83daaa1d903c937b5b3 (patch)
tree14b52f551fdcb427f9b3e0855c11914420349871 /cpukit/score/cpu/i386
parent2009-11-10 Till Straumann <strauman@slac.stanford.edu> (diff)
downloadrtems-b02f4cc11f03d3425e3ce83daaa1d903c937b5b3.tar.bz2
2009-11-09 Till Straumann <strauman@slac.stanford.edu>
* cpu.c, cpu_asm.S, rtems/score/cpu.h, sse_test.c: Added experimental SSE support.
Diffstat (limited to 'cpukit/score/cpu/i386')
-rw-r--r--cpukit/score/cpu/i386/cpu.c33
-rw-r--r--cpukit/score/cpu/i386/cpu_asm.S34
-rw-r--r--cpukit/score/cpu/i386/rtems/score/cpu.h117
-rw-r--r--cpukit/score/cpu/i386/sse_test.c951
4 files changed, 1134 insertions, 1 deletions
diff --git a/cpukit/score/cpu/i386/cpu.c b/cpukit/score/cpu/i386/cpu.c
index b150ad8f8d..e9c2dc64e0 100644
--- a/cpukit/score/cpu/i386/cpu.c
+++ b/cpukit/score/cpu/i386/cpu.c
@@ -56,9 +56,36 @@ void _CPU_Initialize(void)
fp_context = &_CPU_Null_fp_context;
+#ifdef __SSE__
+ asm volatile( "fstcw %0":"=m"(fp_context->fpucw) );
+#else
asm volatile( "fsave (%0)" : "=r" (fp_context)
: "0" (fp_context)
);
+#endif
+ }
+#endif
+
+#ifdef __SSE__
+
+ asm volatile("stmxcsr %0":"=m"(fp_context->mxcsr));
+
+ /* The BSP must enable the SSE extensions (early).
+ * If any SSE instruction was already attempted
+ * then that crashed the system.
+ * As a courtesy, we double-check here but it
+ * may be too late (which is also why we don't
+ * enable SSE here).
+ */
+ {
+ uint32_t cr4;
+ __asm__ __volatile__("mov %%cr4, %0":"=r"(cr4));
+ if ( 0x600 != (cr4 & 0x600) ) {
+ printk("PANIC: RTEMS was compiled for SSE but BSP did not enable it (CR4: 0x%08x)\n", cr4);
+ while ( 1 ) {
+ __asm__ __volatile__("hlt");
+ }
+ }
}
#endif
}
@@ -165,6 +192,9 @@ extern void rtems_exception_prologue_14(void);
extern void rtems_exception_prologue_16(void);
extern void rtems_exception_prologue_17(void);
extern void rtems_exception_prologue_18(void);
+#ifdef __SSE__
+extern void rtems_exception_prologue_19(void);
+#endif
static rtems_raw_irq_hdl tbl[] = {
rtems_exception_prologue_0,
@@ -186,6 +216,9 @@ static rtems_raw_irq_hdl tbl[] = {
rtems_exception_prologue_16,
rtems_exception_prologue_17,
rtems_exception_prologue_18,
+#ifdef __SSE__
+ rtems_exception_prologue_19,
+#endif
};
void rtems_exception_init_mngt(void)
diff --git a/cpukit/score/cpu/i386/cpu_asm.S b/cpukit/score/cpu/i386/cpu_asm.S
index 4084e03fb8..20b9949a06 100644
--- a/cpukit/score/cpu/i386/cpu_asm.S
+++ b/cpukit/score/cpu/i386/cpu_asm.S
@@ -97,6 +97,7 @@ SYM (_CPU_Context_restore):
.set FPCONTEXT_ARG, 4 /* FP context argument */
+#ifndef __SSE__
.p2align 1
PUBLIC (_CPU_Context_save_fp)
SYM (_CPU_Context_save_fp):
@@ -112,18 +113,44 @@ SYM (_CPU_Context_restore_fp):
movl (eax),eax /* eax = FP context area */
frstor (eax) /* restore FP context */
ret
+#endif
+
+#ifdef __SSE__
+#define SSE_OFF 16
+#endif
PUBLIC (_Exception_Handler)
SYM (_Exception_Handler):
pusha /* Push general purpose registers */
+ pushl $0 /* Null pointer to SSE area */
movl esp, ebp /* Save original SP */
+#ifndef __SSE__
subl $4, esp /* Reserve space for argument */
/* Align stack (courtesy for C/gcc) */
andl $ - CPU_STACK_ALIGNMENT, esp
+#else
+ subl $512, esp /* Space for SSE area */
+ /* Align stack (courtesy for C/gcc) */
+ andl $ - CPU_STACK_ALIGNMENT, esp
+/* Doing fwait here will re-throw an already pending FP exception!
+ fwait
+ */
+ fxsave 0(esp) /* Save FPU/SSE state into the aligned area */
+ fninit /* Clean-slate FPU */
+ movl $0x1f80, 0(ebp) /* Pointer slot doubles as scratch: MXCSR with all exceptions masked, flags clear */
+ ldmxcsr 0(ebp) /* Clean-slate MXCSR */
+ movl esp, 0(ebp) /* Store pointer to SSE area */
+ subl $SSE_OFF, esp /* Aligned space for argument */
+#endif
movl ebp, (esp) /* Store argument */
movl _currentExcHandler, eax /* Call function stored in _currentExcHandler */
call * eax
+#ifdef __SSE__
+ fwait
+ fxrstor 16(esp) /* Reload saved FPU/SSE state; 16 == SSE_OFF, the area sits just above the argument slot */
+#endif
movl ebp, esp /* Restore original SP */
+ addl $4, esp /* Skip pointer to SSE area */
popa /* Restore general purpose registers */
addl $8, esp /* Skill vector number and faultCode */
iret
@@ -216,6 +243,13 @@ DISTINCT_EXCEPTION_WITH_FAULTCODE_ENTRY (17)
*/
DISTINCT_EXCEPTION_WITH_FAULTCODE_ENTRY (18)
+#ifdef __SSE__
+/*
+ * SIMD FP Exception
+ */
+DISTINCT_EXCEPTION_WITHOUT_FAULTCODE_ENTRY (19)
+#endif
+
/*
* void *i386_Logical_to_physical(
diff --git a/cpukit/score/cpu/i386/rtems/score/cpu.h b/cpukit/score/cpu/i386/rtems/score/cpu.h
index c8218d1a04..77e6c6bff3 100644
--- a/cpukit/score/cpu/i386/rtems/score/cpu.h
+++ b/cpukit/score/cpu/i386/rtems/score/cpu.h
@@ -61,6 +61,15 @@ extern "C" {
* for the i386, others have it built in (i486DX, Pentium).
*/
+#ifdef __SSE__
+#define CPU_HARDWARE_FP TRUE
+#define CPU_SOFTWARE_FP FALSE
+
+#define CPU_ALL_TASKS_ARE_FP TRUE
+#define CPU_IDLE_TASK_IS_FP TRUE
+#define CPU_USE_DEFERRED_FP_SWITCH FALSE
+#else /* __SSE__ */
+
#if ( I386_HAS_FPU == 1 )
#define CPU_HARDWARE_FP TRUE /* i387 for i386 */
#else
@@ -71,6 +80,7 @@ extern "C" {
#define CPU_ALL_TASKS_ARE_FP FALSE
#define CPU_IDLE_TASK_IS_FP FALSE
#define CPU_USE_DEFERRED_FP_SWITCH TRUE
+#endif /* __SSE__ */
#define CPU_STACK_GROWS_UP FALSE
#define CPU_STRUCTURE_ALIGNMENT
@@ -119,12 +129,38 @@ typedef struct {
/*
* FP context save area for the i387 numeric coprocessors.
*/
+#ifdef __SSE__
+/* All FPU and SSE registers are volatile; hence, as long
+ * as we are within normally executing C code (including
+ * a task switch) there is no need for saving/restoring
+ * any of those registers.
+ * We must save/restore the full FPU/SSE context across
+ * interrupts and exceptions, however:
+ * - after ISR execution a _Thread_Dispatch() may happen
+ * and it is therefore necessary to save the FPU/SSE
+ * registers to be restored when control is returned
+ * to the interrupted task.
+ * - gcc may implicitly use FPU/SSE instructions in
+ * an ISR.
+ *
+ * Even though there is no explicit mentioning of the FPU
+ * control word in the SYSV ABI (i386) being non-volatile
+ * we maintain MXCSR and the FPU control-word for each task.
+ */
+typedef struct {
+ uint32_t mxcsr;
+ uint16_t fpucw;
+} Context_Control_fp;
+
+#else
typedef struct {
uint8_t fp_save_area[108]; /* context size area for I80387 */
/* 28 bytes for environment */
} Context_Control_fp;
+#endif
+
/*
* The following structure defines the set of information saved
@@ -132,9 +168,20 @@ typedef struct {
*
* idtIndex is either the interrupt number or the trap/exception number.
* faultCode is the code pushed by the processor on some exceptions.
+ *
+ * Since the first registers are directly pushed by the CPU they
+ * may not respect 16-byte stack alignment, which is, however,
+ * mandatory for the SSE register area.
+ * Therefore, these registers are stored at an aligned address
+ * and a pointer is stored in the CPU_Exception_frame.
+ * If the executive was compiled without SSE support then
+ * this pointer is NULL.
*/
+struct Context_Control_sse;
+
typedef struct {
+ struct Context_Control_sse *fp_ctxt;
uint32_t edi;
uint32_t esi;
uint32_t ebp;
@@ -150,6 +197,32 @@ typedef struct {
uint32_t eflags;
} CPU_Exception_frame;
+#ifdef __SSE__
+typedef struct Context_Control_sse { /* FPU/SSE save area filled by 'fxsave'; the members add up to 512 bytes.
+  * Field meanings below follow the FXSAVE image described in the Intel SDM;
+  * NOTE(review): confirm against the manual before relying on them.
+  */
+ uint16_t fcw; /* x87 control word */
+ uint16_t fsw; /* x87 status word */
+ uint8_t ftw; /* tag bits, one per x87 register (bit set == register in use) */
+ uint8_t res_1; /* reserved */
+ uint16_t fop; /* presumably: last x87 opcode */
+ uint32_t fpu_ip; /* presumably: last x87 instruction pointer offset */
+ uint16_t cs; /* presumably: segment selector belonging to fpu_ip */
+ uint16_t res_2; /* reserved */
+ uint32_t fpu_dp; /* presumably: last x87 operand (data) pointer offset */
+ uint16_t ds; /* presumably: segment selector belonging to fpu_dp */
+ uint16_t res_3; /* reserved */
+ uint32_t mxcsr; /* SSE control/status register */
+ uint32_t mxcsr_mask; /* presumably: mask of MXCSR bits supported by the CPU */
+ struct {
+ uint8_t fpreg[10]; /* 80-bit x87 register image */
+ uint8_t res_4[ 6]; /* pads each register slot to 16 bytes */
+ } fp_mmregs[8];
+ uint8_t xmmregs[8][16]; /* XMM0..XMM7, 16 bytes each */
+ uint8_t res_5[224]; /* pads the structure to 512 bytes */
+} Context_Control_sse
+__attribute__((aligned(16)))
+;
+#endif
+
typedef void (*cpuExcHandlerType) (CPU_Exception_frame*);
extern cpuExcHandlerType _currentExcHandler;
extern void rtems_exception_init_mngt(void);
@@ -510,19 +583,61 @@ void _CPU_Context_restore(
* This routine saves the floating point context passed to it.
*/
+#ifdef __SSE__
+#define _CPU_Context_save_fp(fp_context_pp) \
+ do { \
+ __asm__ __volatile__( \
+ "fstcw %0" \
+ :"=m"((*(fp_context_pp))->fpucw) \
+ ); \
+ __asm__ __volatile__( \
+ "stmxcsr %0" \
+ :"=m"((*(fp_context_pp))->mxcsr) \
+ ); \
+ } while (0)
+#else
void _CPU_Context_save_fp(
Context_Control_fp **fp_context_ptr
);
+#endif
/*
* _CPU_Context_restore_fp
*
* This routine restores the floating point context passed to it.
*/
-
+#ifdef __SSE__
+/* SSE variant: only the FPU control word and MXCSR are maintained per task.
+ * Both are reloaded from the context passed in by the caller.
+ * fp_context_pp: pointer to a pointer to the Context_Control_fp to load.
+ */
+#define _CPU_Context_restore_fp(fp_context_pp) \
+ do { \
+ __asm__ __volatile__( \
+ "fldcw %0" /* reload x87 control word */ \
+ ::"m"((*(fp_context_pp))->fpucw) \
+ :"fpcr" \
+ ); \
+ /* use the argument, not _Thread_Executing, to match the fldcw above */ \
+ __builtin_ia32_ldmxcsr((*(fp_context_pp))->mxcsr); \
+ } while (0)
+#else
void _CPU_Context_restore_fp(
Context_Control_fp **fp_context_ptr
);
+#endif
+
+#ifdef __SSE__
+#define _CPU_Context_Initialization_at_thread_begin() \
+ do { \
+ __asm__ __volatile__( \
+ "finit" \
+ : \
+ : \
+ :"st","st(1)","st(2)","st(3)", \
+ "st(4)","st(5)","st(6)","st(7)", \
+ "fpsr","fpcr" \
+ ); \
+ if ( _Thread_Executing->fp_context ) { \
+ _CPU_Context_restore_fp(&_Thread_Executing->fp_context); \
+ } \
+ } while (0)
+#endif
#endif /* ASM */
diff --git a/cpukit/score/cpu/i386/sse_test.c b/cpukit/score/cpu/i386/sse_test.c
new file mode 100644
index 0000000000..a6e0fbd6bd
--- /dev/null
+++ b/cpukit/score/cpu/i386/sse_test.c
@@ -0,0 +1,951 @@
+/* $Id$ */
+
+/*
+ * Authorship
+ * ----------
+ * This software was created by
+ * Till Straumann <strauman@slac.stanford.edu>, 2009,
+ * Stanford Linear Accelerator Center, Stanford University.
+ *
+ * Acknowledgement of sponsorship
+ * ------------------------------
+ * This software was produced by
+ * the Stanford Linear Accelerator Center, Stanford University,
+ * under Contract DE-AC03-76SFO0515 with the Department of Energy.
+ *
+ * Government disclaimer of liability
+ * ----------------------------------
+ * Neither the United States nor the United States Department of Energy,
+ * nor any of their employees, makes any warranty, express or implied, or
+ * assumes any legal liability or responsibility for the accuracy,
+ * completeness, or usefulness of any data, apparatus, product, or process
+ * disclosed, or represents that its use would not infringe privately owned
+ * rights.
+ *
+ * Stanford disclaimer of liability
+ * --------------------------------
+ * Stanford University makes no representations or warranties, express or
+ * implied, nor assumes any liability for the use of this software.
+ *
+ * Stanford disclaimer of copyright
+ * --------------------------------
+ * Stanford University, owner of the copyright, hereby disclaims its
+ * copyright and all other rights in this software. Hence, anyone may
+ * freely use it for any purpose without restriction.
+ *
+ * Maintenance of notices
+ * ----------------------
+ * In the interest of clarity regarding the origin and status of this
+ * SLAC software, this and all the preceding Stanford University notices
+ * are to remain affixed to any copy or derivative of this software made
+ * or distributed by the recipient and are to be affixed to any copy of
+ * software made or distributed by the recipient that contains a copy or
+ * derivative of this software.
+ *
+ * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03
+ */
+
+
+/* Code for testing FPU/SSE context save/restore across exceptions
+ * (including interrupts).
+ *
+ * There are two tasks and an IRQ/EXC handler involved. One task (LP)
+ * is of lower priority than the other (HP) task.
+ *
+ * 1) LP task sets up a context area in memory (known contents; every
+ * register is loaded with different values)
+ *
+ * 2) LP task
+ * 2a saves original FP/SSE context
+ * 2b loads context from 1) into FPU/SSE
+ * 2c raises an exception or interrupt
+ *
+ * * (2d save FPU/SSE context after irq/exception returns to
+ * separate area for verification
+ * 2e reload original FP/SSE context.)
+ *
+ * * All these five steps are coded in assembly to prevent
+ * gcc from manipulating the FP/SSE state. The last two steps,
+ * however, are effectively executed during 6 when control is
+ * returned to the LP task.
+ *
+ * 3) IRQ/EXC handler OS wrapper saves context, initializes FPU and
+ * MXCSR.
+ *
+ * 4) user (our) irq/exc handler clears exception condition, clobbers
+ * FPU and XMM regs and finally releases a semaphore on which HP
+ * task is waiting.
+ *
+ * 5) context switch to HP task. HP task clobbers FPU and XMM regs.
+ * Then it tries to re-acquire the synchronization semaphore and
+ * blocks.
+ *
+ * 6) task switch back to (interrupted) LP task. Original context is
+ * restored and verified against the context that was setup in 1).
+ *
+ *
+ * Three methods for interrupting the LP task are tested
+ *
+ * a) FP exception (by setting an exception status in the context from 1)
+ * b) SSE exception (by computing the square root of a vector of negative
+ * numbers.
+ * c) IRQ (software IRQ via 'INT xx' instruction)
+ *
+ */
+#ifdef __rtems__
+
+#include <rtems.h>
+#include <rtems/score/cpu.h>
+#include <rtems/irq.h>
+#include <rtems/error.h>
+
+#endif
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+/* This is currently hardcoded (int xx opcode requires immediate operand) */
+#define SSE_TEST_IRQ 10
+
+typedef uint8_t __v8 __attribute__((vector_size(16)));
+typedef uint32_t __v32 __attribute__((vector_size(16)));
+typedef float __vf __attribute__((vector_size(16)));
+
+#ifndef __rtems__
+/* Clone of what is defined in rtems/score/cpu.h (for testing under linux) */
+typedef struct Context_Control_sse {
+ uint16_t fcw;
+ uint16_t fsw;
+ uint8_t ftw;
+ uint8_t res_1;
+ uint16_t fop;
+ uint32_t fpu_ip;
+ uint16_t cs;
+ uint16_t res_2;
+ uint32_t fpu_dp;
+ uint16_t ds;
+ uint16_t res_3;
+ uint32_t mxcsr;
+ uint32_t mxcsr_mask;
+ struct {
+ uint8_t fpreg[10];
+ uint8_t res_4[ 6];
+ } fp_mmregs[8];
+ uint8_t xmmregs[8][16];
+ uint8_t res_5[224];
+} Context_Control_sse
+__attribute__((aligned(16)))
+;
+#endif
+
+#define MXCSR_FZ (1<<15) /* Flush to zero */
+#define MXCSR_RC(x) (((x)&3)<<13) /* Rounding ctrl */
+#define MXCSR_PM (1<<12) /* Precision msk */
+#define MXCSR_UM (1<<11) /* Underflow msk */
+#define MXCSR_OM (1<<10) /* Overflow msk */
+#define MXCSR_ZM (1<< 9) /* Divbyzero msk */
+#define MXCSR_DM (1<< 8) /* Denormal msk */
+#define MXCSR_IM (1<< 7) /* Invalidop msk */
+#define MXCSR_DAZ (1<< 6) /* Denorml are 0 */
+#define MXCSR_PE (1<< 5) /* Precision flg */
+#define MXCSR_UE (1<< 4) /* Underflow flg */
+#define MXCSR_OE (1<< 3) /* Overflow flg */
+#define MXCSR_ZE (1<< 2) /* Divbyzero flg */
+#define MXCSR_DE (1<< 1) /* Denormal flg */
+#define MXCSR_IE (1<< 0) /* Invalidop flg */
+
+#define MXCSR_ALLM (MXCSR_PM | MXCSR_UM | MXCSR_OM | MXCSR_ZM | MXCSR_DM | MXCSR_IM)
+#define MXCSR_ALLE (MXCSR_PE | MXCSR_UE | MXCSR_OE | MXCSR_ZE | MXCSR_DE | MXCSR_IE)
+
+#define FPSR_B (1<<15) /* FPU busy */
+#define FPSR_C3 (1<<14) /* Cond code C3 */
+#define FPSR_TOP(x) (((x)&7)<<11) /* TOP */
+#define FPSR_C2 (1<<10) /* Cond code C2 */
+#define FPSR_C1 (1<< 9) /* Cond code C1 */
+#define FPSR_C0 (1<< 8) /* Cond code C0 */
+#define FPSR_ES (1<< 7) /* Error summary */
+#define FPSR_SF (1<< 6) /* Stack fault */
+#define FPSR_PE (1<< 5) /* Precision flg */
+#define FPSR_UE (1<< 4) /* Underflow flg */
+#define FPSR_OE (1<< 3) /* Overflow flg */
+#define FPSR_ZE (1<< 2) /* Divbyzero flg */
+#define FPSR_DE (1<< 1) /* Denormal flg */
+#define FPSR_IE (1<< 0) /* Invalidop flg */
+
+#define FPCW_X (1<<12) /* Infinity ctrl */
+#define FPCW_RC(x) (((x)&3)<<10) /* Rounding ctrl */
+#define FPCW_PC(x) (((x)&3)<< 8) /* Precision ctl */
+#define FPCW_PM (1<< 5) /* Precision msk */
+#define FPCW_UM (1<< 4) /* Underflow msk */
+#define FPCW_OM (1<< 3) /* Overflow msk */
+#define FPCW_ZM (1<< 2) /* Divbyzero msk */
+#define FPCW_DM (1<< 1) /* Denormal msk */
+#define FPCW_IM (1<< 0) /* Invalidop msk */
+
+#define FPCW_ALLM (FPCW_PM | FPCW_UM | FPCW_OM | FPCW_ZM | FPCW_DM | FPCW_IM)
+#define FPSR_ALLE (FPSR_ES | FPSR_SF | FPSR_PE | FPSR_UE | FPSR_OE | FPSR_ZE | FPSR_DE | FPSR_IE)
+
+/* Store 'double' into 80-bit register image.
+ *
+ * p_dst: receives the 10 bytes written by 'fstpt'.
+ * v:     value to store; handed to the instruction on the x87 stack top
+ *        ("t" constraint). 'fstpt' pops it, hence the "st" clobber.
+ */
+void
+fp_st1(uint8_t (*p_dst)[10], double v)
+{
+ asm volatile("fstpt %0":"=m"(*p_dst):"t"(v):"st");
+}
+
+/* Store 'double' into 80-bit register image #i in context.
+ *
+ * p_ctxt: context area whose fp_mmregs[i].fpreg slot is overwritten.
+ * i:      register slot index; the array has 8 entries and no range
+ *         check is made here.
+ * v:      value to store (converted by fp_st1()).
+ */
+void
+fp_st(Context_Control_sse *p_ctxt, int i, double v)
+{
+ fp_st1(&p_ctxt->fp_mmregs[i].fpreg,v);
+}
+
+/* Load 'double' from 80-bit register image.
+ *
+ * p_src: 10-byte image that 'fldt' pushes onto the x87 stack; the result
+ *        is taken from the stack top ("=t").
+ *        The second "m" input is not referenced by the template;
+ *        presumably it tells gcc that all 10 bytes are read.
+ * RETURNS: the loaded value as a double.
+ */
+double
+fp_ld1(uint8_t (*p_src)[10])
+{
+double v;
+
+ asm volatile("fldt %1":"=t"(v):"m"((*p_src)[0]),"m"(*p_src));
+ return v;
+}
+
+/* Load 'double' from 80-bit register image #i in context.
+ *
+ * p_ctxt: context area to read from.
+ * i:      register slot index into fp_mmregs[] (8 entries; not checked).
+ * RETURNS: the slot contents converted by fp_ld1().
+ */
+double
+fp_ld(Context_Control_sse *p_ctxt, int i)
+{
+ return fp_ld1(&p_ctxt->fp_mmregs[i].fpreg);
+}
+
+#define FPUCLOBBER \
+ "st","st(1)","st(2)","st(3)", \
+ "st(4)","st(5)","st(6)","st(7)",\
+ "fpsr","fpcr"
+
+/* There seems to be no way to say that mxcsr was clobbered */
+
+#define SSECLOBBER \
+ "xmm0","xmm1","xmm2","xmm3", \
+ "xmm4","xmm5","xmm6","xmm7"
+
+static void
+sse_clobber(uint32_t x)
+{ /* Overwrite XMM0..XMM7 with four copies of 'x' each. */
+__v32 v = { x, x, x, x };
+ asm volatile (
+ " movdqa %0, %%xmm0 \n" /* load the pattern from memory ... */
+ " movdqa %%xmm0, %%xmm1 \n" /* ... and replicate it into the other registers */
+ " movdqa %%xmm0, %%xmm2 \n"
+ " movdqa %%xmm0, %%xmm3 \n"
+ " movdqa %%xmm0, %%xmm4 \n"
+ " movdqa %%xmm0, %%xmm5 \n"
+ " movdqa %%xmm0, %%xmm6 \n"
+ " movdqa %%xmm0, %%xmm7 \n"
+ :
+ :"m"(v)
+ :SSECLOBBER
+ );
+}
+
+/* Clobber the complete FPU/SSE state: 'finit' resets the x87 FPU and
+ * every XMM register is filled with the pattern { v1, v2, v1, v2 }.
+ * Written as a top-level asm routine (cdecl, arguments on the stack).
+ */
+void
+all_clobber(uint32_t v1, uint32_t v2);
+
+asm(
+"all_clobber: \n"
+" finit \n"
+/* 0(%esp) is the return address; the two arguments start at 4(%esp) */
+" movq 4(%esp), %xmm0 \n"
+" punpcklqdq %xmm0, %xmm0 \n"
+" movdqa %xmm0, %xmm1 \n"
+" movdqa %xmm0, %xmm2 \n"
+" movdqa %xmm0, %xmm3 \n"
+" movdqa %xmm0, %xmm4 \n"
+" movdqa %xmm0, %xmm5 \n"
+" movdqa %xmm0, %xmm6 \n"
+" movdqa %xmm0, %xmm7 \n"
+" ret \n"
+);
+
+/* Clear FPU and save FPU/SSE registers to context area.
+ *
+ * p_ctxt: destination of the 'fxsave'. The routine is plain asm and does
+ *         no alignment handling of its own.
+ */
+
+void
+init_ctxt(Context_Control_sse *p_ctxt);
+
+asm(
+"init_ctxt: \n"
+" finit \n"
+" mov 4(%esp), %eax\n" /* first (only) stack argument: p_ctxt */
+" fxsave (%eax) \n"
+" fwait \n"
+" ret \n"
+);
+
+/* Save FPU/SSE registers to context area.
+ *
+ * p_ctxt: destination; it is zeroed first so that bytes 'fxsave' does not
+ *         write have a known value. Unlike init_ctxt() the FPU is not
+ *         reset beforehand (the 'finit' is commented out).
+ */
+
+static void
+stor_ctxt(Context_Control_sse *p_ctxt)
+{
+ memset(p_ctxt, 0, sizeof(*p_ctxt));
+ asm volatile(
+/* " finit \n" */
+ " fxsave %0 \n"
+ " fwait \n"
+ : "=m"(*p_ctxt)
+ :
+ : FPUCLOBBER
+ );
+}
+
+#define H08 "0x%02"PRIx8
+#define H16 "0x%04"PRIx16
+#define H32 "0x%08"PRIx32
+
+#define F16 "mismatch ("H16" != "H16")\n"
+
+#define FLDCMP(fld, fmt) \
+ if ( a->fld != b->fld ) { \
+ rval = 1; \
+ if ( !quiet ) \
+ fprintf(stderr,#fld" mismatch ("fmt" != "fmt")\n",a->fld, b->fld); \
+ }
+
+#define FLTCMP(i) \
+ do { \
+ if ( ( (a->ftw ^ b->ftw) & (1<<i)) \
+ || ( (a->ftw & b->ftw & (1<<i)) && \
+ memcmp(a->fp_mmregs[i].fpreg, \
+ b->fp_mmregs[i].fpreg, \
+ sizeof(a->fp_mmregs[i].fpreg)) \
+ ) \
+ ) { \
+ rval = 1; \
+ if ( !quiet ) { \
+ double fa = fp_ld(a, i); \
+ double fb = fp_ld(b, i); \
+ if ( ((a->ftw ^ b->ftw) & (1<<i)) ) \
+ fprintf(stderr,"fpreg[%u] TAG mismatch (%u != %u)\n",i,(a->ftw & (1<<i)) ? 1 : 0,(b->ftw & (1<<i)) ? 1 : 0); \
+ else \
+ fprintf(stderr,"fpreg[%u] mismatch (%g != %g)\n",i,fa,fb); \
+ } \
+ } \
+ } while (0)
+
+#define XMMCMP(i) \
+ do { \
+ if ( memcmp(&a->xmmregs[i], \
+ &b->xmmregs[i], \
+ sizeof(a->xmmregs[i])) \
+ ) { \
+ rval = 1; \
+ if ( !quiet ) { \
+ int _jj; \
+ fprintf(stderr,"xmmreg[%u] mismatch:\n", i); \
+ fprintf(stderr," "); \
+ for (_jj=0; _jj<16; _jj++) \
+ fprintf(stderr,"%02x ",a->xmmregs[i][_jj]); \
+ fprintf(stderr,"\n !=\n"); \
+ fprintf(stderr," "); \
+ for (_jj=0; _jj<16; _jj++) \
+ fprintf(stderr,"%02x ",b->xmmregs[i][_jj]); \
+ fprintf(stderr,"\n"); \
+ } \
+ } \
+ } while (0)
+
+
+/* Compare two FPU/SSE context areas and flag differences;
+ *
+ * a, b:  the contexts to compare. The named control/status fields, the
+ *        eight x87 registers and the eight XMM registers are compared.
+ *        The reserved members are not looked at. An x87 register's
+ *        contents are only compared when its tag bit is set in both
+ *        contexts; differing tag bits count as a mismatch.
+ * quiet: if nonzero, no messages are printed to stderr.
+ *
+ * RETURNS: zero if the contexts match and nonzero otherwise
+ */
+static int
+cmp_ctxt(Context_Control_sse *a, Context_Control_sse *b, int quiet)
+{
+int rval = 0; /* set to 1 by the comparison macros on any mismatch */
+int i;
+ FLDCMP(fcw,H16);
+ FLDCMP(fsw,H16);
+ FLDCMP(ftw,H08);
+ FLDCMP(fop,H16);
+ FLDCMP(fpu_ip,H32);
+ FLDCMP(cs,H16);
+ FLDCMP(fpu_dp,H32);
+ FLDCMP(ds,H16);
+ FLDCMP(mxcsr,H32);
+ FLDCMP(mxcsr_mask,H32);
+ for ( i=0; i<8; i++ ) {
+ FLTCMP(i);
+ }
+ for ( i=0; i<8; i++ ) {
+ XMMCMP(i);
+ }
+ return rval;
+}
+
+/* Possible arguments to exc_raise() */
+
+#define FP_EXC 0
+#define IRQ_EXC 1
+#define SSE_EXC -1
+
+/* Check stack alignment by raising the interrupt from a
+ * non-16-byte aligned section of code. The exception/IRQ
+ * handler must align the stack and SSE context area
+ * properly or it will crash.
+ */
+#define __INTRAISE(x) " int $32+"#x" \n"
+#define INTRAISE(x) __INTRAISE(x)
+
+asm( /* do_raise: expects the exc_raise() 'kind' argument in %eax */
+"do_raise: \n"
+" fwait \n" /* surfaces an x87 exception already pending in the loaded context */
+" test %eax, %eax \n"
+" je 2f \n" /* kind == FP_EXC (0): nothing more to do */
+" jl 1f \n" /* kind == SSE_EXC (negative): go raise a SIMD exception */
+INTRAISE(SSE_TEST_IRQ) /* otherwise (IRQ_EXC): software interrupt, vector 32+SSE_TEST_IRQ */
+" jmp 2f \n"
+"1: sqrtps %xmm0, %xmm0 \n" /* xmm0 was loaded with negative numbers by the caller */
+"2: \n"
+" ret \n"
+);
+
+#define SSE_TEST_HP_FAILED 1
+#define SSE_TEST_FSPR_FAILED 2
+#define SSE_TEST_CTXTCMP_FAILED 4
+
+static const char *fail_msgs[] = {
+ "Seems that HP task was not executing",
+ "FPSR 'Invalid-operation' flag should be clear",
+ "Restored context does NOT match the saved one",
+};
+
+static void prstat(int st, const char *where)
+{
+int i,msk;
+ for ( i=0, msk=1; i<sizeof(fail_msgs)/sizeof(fail_msgs[0]); i++, msk<<=1 ) {
+ if ( (st & msk) ) {
+ fprintf(stderr,"sse_test ERROR: %s (testing: %s)\n", fail_msgs[i], where);
+ }
+ }
+}
+
+int sse_test_debug = 0;
+
+/* Load a known FPU/SSE context, provoke an exception or interrupt and
+ * verify that the context is intact afterwards.
+ *
+ * kind: FP_EXC, SSE_EXC or IRQ_EXC; handed to do_raise() in %eax.
+ *
+ * RETURNS: zero on success, otherwise a bitwise OR of SSE_TEST_HP_FAILED,
+ *          SSE_TEST_FSPR_FAILED and SSE_TEST_CTXTCMP_FAILED.
+ */
+static int
+exc_raise(int kind)
+{
+Context_Control_sse nctxt;     /* context read back after the exception     */
+Context_Control_sse octxt;     /* known context loaded before the exception */
+Context_Control_sse orig_ctxt; /* caller's context, saved/restored by asm   */
+int i,j,rval;
+double s2;
+uint16_t fsw;
+__vf f4 = { -1., -2., -3., -4. };
+__vf tmp;
+/* unsigned constant: (1<<31) would overflow a signed int */
+__v32 sgn = { (1U<<31), (1U<<31), (1U<<31), (1U<<31) };
+
+ stor_ctxt(&octxt);
+
+ octxt.fsw &= ~FPSR_ALLE;
+ octxt.mxcsr &= ~MXCSR_ALLE;
+
+ /* every register gets a distinct, recognizable value */
+ for ( i=0; i<8; i++ ) {
+  fp_st(&octxt, i, (double)i+0.1);
+  for (j=0; j<16; j++) {
+   octxt.xmmregs[i][j]=(i<<4)+j;
+  }
+ }
+
+
+ if ( SSE_EXC == kind ) {
+  /* negative operands for sqrtps; unmask 'invalid operation' */
+  memcpy(octxt.xmmregs[0], &f4, sizeof(f4));
+  octxt.mxcsr &= ~MXCSR_IM;
+ }
+
+ /* set tags to 'valid' */
+ octxt.ftw = 0xff;
+
+ /* enable 'invalid arg' exception */
+ octxt.fcw &= ~ ( FPCW_IM );
+
+ if ( FP_EXC == kind ) {
+  /* a pending exception in the status word fires on the next fwait */
+  octxt.fsw |= ( FPSR_IE | FPSR_ES );
+ }
+
+ /* orig_ctxt is only filled in by the asm below; report the status
+  * word that is about to be loaded instead of uninitialized memory.
+  */
+ if ( sse_test_debug )
+  printk("RAISE (fsw was 0x%04x)\n", octxt.fsw);
+ asm volatile(
+ " fxsave %2 \n"
+#ifdef __rtems__
+ " movl %4, sse_test_check\n"
+#endif
+ " fxrstor %3 \n"
+ " call do_raise \n"
+#ifdef __rtems__
+ " movl sse_test_check, %1\n"
+#else
+ " movl $0, %1 \n"
+#endif
+#ifdef TEST_MISMATCH
+ " pxor %%xmm0, %%xmm0 \n"
+#endif
+ " fxsave %0 \n"
+ " fxrstor %2 \n"
+ : "=m"(nctxt),"=&r"(rval),"=m"(orig_ctxt)
+ : "m"(octxt), "i"(SSE_TEST_HP_FAILED),"a"(kind)
+ : "xmm0"
+ );
+
+ if ( ( FPSR_IE & nctxt.fsw ) ) {
+  rval |= SSE_TEST_FSPR_FAILED;
+ }
+ if ( FP_EXC == kind )
+  nctxt.fsw |= (FPSR_IE | FPSR_ES);
+ else if ( SSE_EXC == kind ) {
+  /* the handler negates xmm0, so expect sqrt of the absolute values */
+  tmp = __builtin_ia32_sqrtps( (__vf)(~sgn & (__v32)f4) );
+  /* sqrt raises PE; just clear it */
+  nctxt.mxcsr &= ~MXCSR_PE;
+  memcpy( octxt.xmmregs[0], &tmp, sizeof(tmp) );
+ }
+
+ if ( cmp_ctxt(&nctxt, &octxt, 0) ) {
+  rval |= SSE_TEST_CTXTCMP_FAILED;
+ }
+
+ s2 = sqrt(2.0);
+
+ asm volatile("fstsw %0":"=m"(fsw));
+
+ if ( sse_test_debug )
+  printf("sqrt(2): %f (FSTW: 0x%02"PRIx16")\n", s2, fsw);
+
+ return rval;
+}
+
+#ifdef __rtems__
+static void
+sse_test_ehdl(CPU_Exception_frame *p_f);
+
+rtems_id sse_test_sync = 0;
+cpuExcHandlerType sse_test_ohdl = 0;
+
+CPU_Exception_frame *sse_test_frame = 0;
+volatile int sse_test_check = SSE_TEST_HP_FAILED;
+unsigned sse_tests = 0;
+
+rtems_task
+sse_test_hp_task(rtems_task_argument arg)
+{ /* High-priority (HP) task of the test.
+   * arg: id of the synchronization semaphore.
+   * After checking its initial FPCW/MXCSR, the task blocks on the
+   * semaphore once per enabled test, clobbers the FPU/SSE registers
+   * when it wakes up and clears sse_test_check to show that it ran.
+   * It suspends itself when done.
+   */
+rtems_id sync = (rtems_id)arg;
+
+uint16_t fp_cw;
+uint32_t mxcsr;
+rtems_status_code sc;
+const char * msgs[] = {"FPU_EXC", "SSE_EXC", "IRQ_EXC"};
+int i;
+
+ /* verify that FPU control word is default value */
+ asm volatile("fstcw %0":"=m"(fp_cw));
+ if ( fp_cw != _CPU_Null_fp_context.fpucw ) {
+ fprintf(
+ stderr,
+ "ERROR: FPU CW initialization mismatch: got 0x%04"PRIx16"; expected 0x%04"PRIx16"\n",
+ fp_cw,
+ _CPU_Null_fp_context.fpucw
+ );
+ }
+
+ /* check MXCSR default value */
+ asm volatile("stmxcsr %0":"=m"(mxcsr));
+ if ( mxcsr != _CPU_Null_fp_context.mxcsr ) {
+ fprintf(
+ stderr,
+ "ERROR: MXCSR initialization mismatch: got 0x%08"PRIx32"; expected 0x%08"PRIx32"\n",
+ mxcsr,
+ _CPU_Null_fp_context.mxcsr
+ );
+ }
+
+
+ for (i=0; i<sizeof(msgs)/sizeof(msgs[0]); i++ ) {
+ if ( ( sse_tests & (1<<i) ) ) { /* bit i of sse_tests enables test msgs[i] */
+ if ( sse_test_debug )
+ printk("HP task will now block for %s\n",msgs[i]);
+
+ /* Blocking here lets the low-priority task continue */
+ sc = rtems_semaphore_obtain(sync, RTEMS_WAIT, 500);
+
+ all_clobber(0xaffeaffe, 0xcafecafe); /* done even if the wait above failed */
+
+ if ( RTEMS_SUCCESSFUL != sc ) {
+ rtems_error(sc,"ERROR: sse_test hp task wasn't notified of exception\n");
+ goto bail;
+ }
+
+ /* set flag indicating that we executed until here */
+ sse_test_check = 0;
+ }
+ }
+
+bail:
+ rtems_task_suspend(RTEMS_SELF);
+}
+
+/* Flags to skip individual tests */
+#define SSE_TEST_FPU_EXC (1<<0)
+#define SSE_TEST_SSE_EXC (1<<1)
+#define SSE_TEST_IRQ_EXC (1<<2)
+
+#define SSE_TEST_ALL 7
+
+/* If this flag is given the executing task is not deleted
+ * when the test finishes. This is useful if you want to
+ * execute from a shell or similar.
+ */
+#define SSE_TEST_NO_DEL (1<<0)
+
+/* Read back FPCW and MXCSR and compare them with the values the LP task
+ * loaded at startup. The MXCSR exception flags are ignored.
+ *
+ * where:     name of the test that just ran (used in the messages).
+ * fp_cw_set: expected FPU control word.
+ * mxcsr_set: expected MXCSR (exception flags clear).
+ *
+ * RETURNS: number of mismatches found (0..2).
+ */
+static int
+sse_test_check_ctrl(const char *where, uint16_t fp_cw_set, uint32_t mxcsr_set)
+{
+uint16_t fp_cw;
+uint32_t mxcsr;
+int mismatches = 0;
+
+ /* read both registers before any library call can touch them */
+ asm volatile("fstcw %0":"=m"(fp_cw));
+ asm volatile("stmxcsr %0":"=m"(mxcsr));
+ mxcsr &= ~(MXCSR_ALLE);
+ if ( fp_cw != fp_cw_set ) {
+  fprintf(stderr,"sse_test ERROR: FPCW mismatch (after %s): expected 0x%04"PRIx16", got 0x%04"PRIx16"\n", where, fp_cw_set, fp_cw);
+  mismatches++;
+ }
+ if ( mxcsr != mxcsr_set ) {
+  fprintf(stderr,"sse_test ERROR: MXCSR mismatch (after %s): expected 0x%08"PRIx32", got 0x%08"PRIx32"\n", where, mxcsr_set, mxcsr);
+  mismatches++;
+ }
+ return mismatches;
+}
+
+/* Low-priority (LP) task: drives the whole test.
+ *
+ * Task arg is bitmask of these flags: bit 0 is SSE_TEST_NO_DEL; the
+ * remaining bits, shifted right by one, are SSE_TEST_xxx_EXC flags of
+ * tests to skip.
+ *
+ * The task loads non-default FPCW/MXCSR rounding modes, creates the
+ * synchronization semaphore and the HP task, then runs each enabled test
+ * and checks that its own FPCW/MXCSR survived. A summary is printed and
+ * all resources are released at the end.
+ */
+rtems_task
+sse_test_lp_task(rtems_task_argument arg)
+{
+rtems_id hp_task = 0;
+rtems_status_code sc;
+rtems_task_priority pri;
+uint16_t fp_cw_set;
+uint32_t mxcsr_set;
+rtems_irq_connect_data irqd;
+int flags = (int)arg;
+int st;
+int errs = 0;
+
+ sse_tests = SSE_TEST_ALL & ~(flags>>1);
+
+ sse_test_ohdl = 0;
+
+ /* use a non-default rounding mode so that a lost context is noticed */
+ fp_cw_set = _CPU_Null_fp_context.fpucw | FPCW_RC(3) ;
+ mxcsr_set = _CPU_Null_fp_context.mxcsr | MXCSR_RC(3) ;
+ asm volatile("ldmxcsr %0"::"m"(mxcsr_set));
+ asm volatile("fldcw %0"::"m"(fp_cw_set));
+
+ sc = rtems_semaphore_create(
+  rtems_build_name('s','s','e','S'),
+  0,
+  RTEMS_SIMPLE_BINARY_SEMAPHORE,
+  0,
+  &sse_test_sync
+ );
+ if ( RTEMS_SUCCESSFUL != sc ) {
+  rtems_error(sc, "sse_test ERROR: creation of 'sync' semaphore failed");
+  errs++;
+  goto bail;
+ }
+
+ /* query our own priority; the HP task is created above it */
+ rtems_task_set_priority( RTEMS_SELF, RTEMS_CURRENT_PRIORITY, &pri );
+
+ sc = rtems_task_create(
+  rtems_build_name('s','s','e','H'),
+  pri - 2,
+  20000,
+  RTEMS_DEFAULT_MODES,
+  RTEMS_FLOATING_POINT,
+  &hp_task
+ );
+ if ( RTEMS_SUCCESSFUL != sc ) {
+  hp_task = 0;
+  rtems_error( sc, "sse_test ERROR: creation of high-priority task failed");
+  errs++;
+  goto bail;
+ }
+
+ sc = rtems_task_start( hp_task, sse_test_hp_task, (rtems_task_argument)sse_test_sync );
+ if ( RTEMS_SUCCESSFUL != sc ) {
+  rtems_error( sc, "sse_test ERROR: start of high-priority task failed");
+  /* count the failure; otherwise the summary would say PASSED */
+  errs++;
+  goto bail;
+ }
+
+ /* Test if FP/SSE context is saved/restored across an exception */
+ sse_test_ohdl = _currentExcHandler;
+ _currentExcHandler = sse_test_ehdl;
+
+ if ( (sse_tests & SSE_TEST_FPU_EXC) ) {
+  if ( (st = exc_raise(FP_EXC)) ) {
+   prstat(st,"FP_EXC");
+   errs++;
+  }
+
+  /* Test modified FPCW/MXCSR */
+  errs += sse_test_check_ctrl("FP_EXC", fp_cw_set, mxcsr_set);
+ }
+
+ if ( (sse_tests & SSE_TEST_SSE_EXC) ) {
+  if ( (st = exc_raise(SSE_EXC)) ) {
+   prstat(st, "SSE_EXC");
+   errs++;
+  }
+
+  /* Test modified FPCW/MXCSR */
+  errs += sse_test_check_ctrl("SSE_EXC", fp_cw_set, mxcsr_set);
+ }
+
+
+ if ( (sse_tests & SSE_TEST_IRQ_EXC) ) {
+  memset( &irqd, 0, sizeof(irqd) );
+  irqd.name = SSE_TEST_IRQ;
+  irqd.hdl = (void*)sse_test_ehdl;
+  irqd.handle = 0;
+
+  if ( ! BSP_install_rtems_irq_handler( &irqd ) ) {
+   fprintf(stderr, "sse_test ERROR: Unable to install ISR\n");
+   errs++;
+   goto bail;
+  }
+
+  /* Test if FP/SSE context is saved/restored across an interrupt */
+  if ( (st = exc_raise(IRQ_EXC)) ) {
+   prstat(st, "IRQ");
+   errs++;
+  }
+
+  if ( ! BSP_remove_rtems_irq_handler( &irqd ) ) {
+   fprintf(stderr, "sse_test ERROR: Unable to uninstall ISR\n");
+  }
+
+  /* Test modified FPCW/MXCSR */
+  errs += sse_test_check_ctrl("IRQ", fp_cw_set, mxcsr_set);
+ }
+
+
+bail:
+ /* Wait for console to calm down... */
+ rtems_task_wake_after(5);
+ fprintf(stderr,"SSE/FPU Test %s (%d errors)\n", errs ? "FAILED":"PASSED", errs);
+ /* put the previous exception handler back if we replaced it */
+ if ( sse_test_ohdl ) {
+  _currentExcHandler = sse_test_ohdl;
+  sse_test_ohdl = 0;
+ }
+ if ( sse_test_sync )
+  rtems_semaphore_delete( sse_test_sync );
+ sse_test_sync = 0;
+ if ( hp_task )
+  rtems_task_delete( hp_task );
+
+ if ( ! (flags & SSE_TEST_NO_DEL) )
+  rtems_task_delete( RTEMS_SELF );
+}
+
+/* Exception/interrupt handler used by the test.
+ *
+ * p_f: exception frame when called through _currentExcHandler; NULL when
+ *      invoked as the ISR of the software interrupt.
+ *
+ * For exceptions #16 (FPU) and #19 (SIMD) the handler removes the
+ * exception condition from the saved context and checks that the saved
+ * XMM registers hold the pattern set up by exc_raise(). Any other
+ * exception, or a frame without an FP/SSE context, is passed to the
+ * previously installed handler and nothing else is done. Finally, the
+ * live FPU/SSE state is clobbered and the HP task is released.
+ */
+static void
+sse_test_ehdl(CPU_Exception_frame *p_f)
+{
+int i,j,start = 0;
+int mismatch;
+__vf f4;
+
+ if ( p_f ) {
+  printk("Got exception #%u\n", p_f->idtIndex);
+  printk("EIP: 0x%08x, ESP: 0x%08x\n", p_f->eip, p_f->esp0);
+  printk("TID: 0x%08x\n", _Thread_Executing->Object.id);
+
+  if ( ! p_f->fp_ctxt ) {
+   printk("ERROR: NO FP/SSE CONTEXT ATTACHED ??\n");
+   sse_test_ohdl(p_f);
+   /* no context to inspect; going on would dereference NULL */
+   return;
+  }
+  if ( 16 == p_f->idtIndex ) {
+   printk("Resetting FP status (0x%04"PRIx16")\n", p_f->fp_ctxt->fsw);
+   p_f->fp_ctxt->fsw = 0;
+  } else if ( 19 == p_f->idtIndex ) {
+   /* xmm0 held the negative sqrtps operands: flip their sign so the
+    * instruction succeeds when re-executed, and skip xmm0 in the
+    * pattern check below.
+    */
+   start = 1;
+   memcpy(&f4, p_f->fp_ctxt->xmmregs[0], sizeof(f4));
+   f4 = -f4;
+   memcpy(p_f->fp_ctxt->xmmregs[0], &f4, sizeof(f4));
+   p_f->fp_ctxt->mxcsr &= ~MXCSR_ALLE;
+  } else {
+   printk("(skipping non-FP exception)\n");
+   sse_test_ohdl(p_f);
+   /* not ours: do not check registers or wake the HP task */
+   return;
+  }
+
+  printk("Checking XMM regs -- ");
+  for ( mismatch=0, i=start; i<8; i++ ) {
+   for ( j=0; j<16; j++ ) {
+    if ( p_f->fp_ctxt->xmmregs[i][j] != ((i<<4) | j) )
+     mismatch++;
+   }
+  }
+  if ( mismatch ) {
+   printk("%u mismatches; dump:\n", mismatch);
+   for ( i=0; i<8; i++ ) {
+    for ( j=0; j<16; j++ ) {
+     printk("0x%02x ", p_f->fp_ctxt->xmmregs[i][j]);
+    }
+    printk("\n");
+   }
+  } else {
+   printk("OK\n");
+  }
+ } else {
+  printk("IRQ %u\n", SSE_TEST_IRQ);
+ }
+ /* the OS wrapper must have saved the interrupted state, so this
+  * should not be visible to the LP task
+  */
+ printk("Clobbering FPU/SSE state\n");
+ asm volatile("finit");
+ sse_clobber(0xdeadbeef);
+ printk("Notifying task\n");
+ rtems_semaphore_release( sse_test_sync );
+}
+
+#else
+
+/* Code using signals for testing under linux; unfortunately, 32-bit
+ * linux seems to pass no SSE context info to the sigaction...
+ */
+
+#include <signal.h>
+#include <ucontext.h>
+
+#define MKCASE(X) case FPE_##X: msg="FPE_"#X; break;
+
+#define CLRXMM(i) asm volatile("pxor %%xmm"#i", %%xmm"#i:::"xmm"#i)
+
+/* SIGFPE handler for the linux build of the test.
+ *
+ * signum: signal number; anything but SIGFPE is rejected with a warning.
+ * p_info: si_code selects the FPE_xxx name that is printed; si_addr is
+ *         reported as the faulting address.
+ * arg3:   the ucontext_t of the interrupted code. On linux its saved FP
+ *         status word is reset.
+ *
+ * Finally the live FPU/SSE state is clobbered.
+ */
+static void
+fpe_act(int signum, siginfo_t *p_info, void *arg3)
+{
+ucontext_t *p_ctxt = arg3;
+const char *msg = "FPE_UNKNOWN";
+
+ if ( SIGFPE != signum ) {
+  fprintf(stderr,"WARNING: fpe_act handles SIGFPE\n");
+  return;
+ }
+ switch ( p_info->si_code ) {
+  default:
+   /* si_code is an int */
+   fprintf(stderr,"WARNING: fpe_act got unknown code %d\n", p_info->si_code);
+   return;
+  MKCASE(INTDIV);
+  MKCASE(INTOVF);
+  MKCASE(FLTDIV);
+  MKCASE(FLTOVF);
+  MKCASE(FLTUND);
+  MKCASE(FLTRES);
+  MKCASE(FLTINV);
+  MKCASE(FLTSUB);
+ }
+ fprintf(stderr,"Got SIGFPE (%s) @%p\n", msg, p_info->si_addr);
+#ifdef __linux__
+ fprintf(stderr,"Resetting FP status 0x%02lx\n", p_ctxt->uc_mcontext.fpregs->sw);
+ p_ctxt->uc_mcontext.fpregs->sw = 0;
+#ifdef TEST_MISMATCH
+ fp_st1((void*)&p_ctxt->uc_mcontext.fpregs->_st[3],2.345);
+#endif
+#endif
+
+ /* Clear FPU; if context is properly saved/restored around exception
+  * then this shouldn't disturb the register contents of the interrupted
+  * task/process.
+  */
+ asm volatile("finit");
+ sse_clobber(0xdeadbeef);
+}
+
+static void
+test(void)
+{
+Context_Control_sse ctxt;
+
+ stor_ctxt(&ctxt);
+ printf("FPCW: 0x%"PRIx16"\nFPSW: 0x%"PRIx16"\n", ctxt.fcw, ctxt.fsw);
+ printf("FTAG: 0x%"PRIx8"\n",ctxt.ftw);
+}
+
+int
+main(int argc, char **argv)
+{
+struct sigaction a1, a2;
+uint32_t mxcsr;
+
+ memset(&a1, 0, sizeof(a1));
+
+ a1.sa_sigaction = fpe_act;
+ a1.sa_flags = SA_SIGINFO;
+
+ if ( sigaction(SIGFPE, &a1, &a2) ) {
+ perror("sigaction");
+ return 1;
+ }
+
+ asm volatile("stmxcsr %0":"=m"(mxcsr));
+ printf("MXCSR: 0x%08"PRIx32"\n", mxcsr);
+
+ test();
+ exc_raise(0);
+ return 0;
+}
+#endif
+
+/* Helpers to access CR4 and MXCSR.
+ *
+ * mfcr4(): RETURNS the current contents of control register CR4.
+ * NOTE(review): access to CR4 presumably requires supervisor privilege --
+ * confirm before calling this from the linux build.
+ */
+
+uint32_t
+mfcr4()
+{
+uint32_t rval;
+ asm volatile("mov %%cr4, %0":"=r"(rval));
+ return rval;
+}
+
+void
+mtcr4(uint32_t rval)
+{ /* Write 'rval' to control register CR4 (counterpart of mfcr4()). */
+ asm volatile("mov %0, %%cr4"::"r"(rval));
+}
+
+uint32_t
+mfmxcsr()
+{ /* RETURNS: the current MXCSR contents, read with 'stmxcsr'. */
+uint32_t rval;
+ asm volatile("stmxcsr %0":"=m"(rval));
+ return rval;
+}
+
+void
+mtmxcsr(uint32_t rval)
+{ /* Load 'rval' into MXCSR with 'ldmxcsr'. */
+ asm volatile("ldmxcsr %0"::"m"(rval));
+}
+
+
+/* Take the packed square root of four negative numbers; with the SSE
+ * 'invalid operation' exception unmasked this raises a SIMD exception.
+ *
+ * RETURNS: the first element of the result vector.
+ */
+float
+sseraise()
+{
+/* all four elements are -2 (a comma was missing before the last one) */
+__vf f4={-2., -2., -2., -2.};
+float f;
+ f4 = __builtin_ia32_sqrtps( f4 );
+ memcpy(&f,&f4,sizeof(f));
+ return f;
+}