/** * @file * * @brief Test FPU/SSE Context Save and Restore */ /* * Authorship * ---------- * This software was created by * Till Straumann , 2009, * Stanford Linear Accelerator Center, Stanford University. * * Acknowledgement of sponsorship * ------------------------------ * This software was produced by * the Stanford Linear Accelerator Center, Stanford University, * under Contract DE-AC03-76SFO0515 with the Department of Energy. * * Government disclaimer of liability * ---------------------------------- * Neither the United States nor the United States Department of Energy, * nor any of their employees, makes any warranty, express or implied, or * assumes any legal liability or responsibility for the accuracy, * completeness, or usefulness of any data, apparatus, product, or process * disclosed, or represents that its use would not infringe privately owned * rights. * * Stanford disclaimer of liability * -------------------------------- * Stanford University makes no representations or warranties, express or * implied, nor assumes any liability for the use of this software. * * Stanford disclaimer of copyright * -------------------------------- * Stanford University, owner of the copyright, hereby disclaims its * copyright and all other rights in this software. Hence, anyone may * freely use it for any purpose without restriction. * * Maintenance of notices * ---------------------- * In the interest of clarity regarding the origin and status of this * SLAC software, this and all the preceding Stanford University notices * are to remain affixed to any copy or derivative of this software made * or distributed by the recipient and are to be affixed to any copy of * software made or distributed by the recipient that contains a copy or * derivative of this software. * * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03 */ /* Code for testing FPU/SSE context save/restore across exceptions * (including interrupts). 
 *
 * There are two tasks and an IRQ/EXC handler involved. One task (LP)
 * is of lower priority than the other (HP) task.
 *
 * 1) LP task sets up a context area in memory (known contents; every
 *    register is loaded with different values)
 *
 * 2) LP task
 *       2a saves original FP/SSE context
 *       2b loads context from 1) into FPU/SSE
 *       2c raises an exception or interrupt
 *
 *   *  (2d save FPU/SSE context after irq/exception returns to
 *          separate area for verification
 *       2e reload original FP/SSE context.)
 *
 *   *  All these five steps are coded in assembly to prevent
 *      gcc from manipulating the FP/SSE state. The last two steps,
 *      however, are effectively executed during 6) when control is
 *      returned to the LP task.
 *
 * 3) IRQ/EXC handler OS wrapper saves context, initializes FPU and
 *    MXCSR.
 *
 * 4) user (our) irq/exc handler clears exception condition, clobbers
 *    FPU and XMM regs and finally releases a semaphore on which HP
 *    task is waiting.
 *
 * 5) context switch to HP task. HP task clobbers FPU and XMM regs.
 *    Then it tries to re-acquire the synchronization semaphore and
 *    blocks.
 *
 * 6) task switch back to (interrupted) LP task. Original context is
 *    restored and verified against the context that was set up in 1).
 *
 *
 * Three methods for interrupting the LP task are tested:
 *
 *  a) FP exception (by setting an exception status in the context from 1))
 *  b) SSE exception (by computing the square root of a vector of negative
 *     numbers)
 *  c) IRQ (software IRQ via 'INT xx' instruction)
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* NOTE(review): every #include below has lost its header name in this
 * copy of the file (only the bare directive is left). The names must be
 * restored from the upstream source before this can compile.
 */
#ifdef __rtems__
#include
#include
#include
#include
#endif
#include
#include
#include
#include
#include

/* This is currently hardcoded (int xx opcode requires immediate operand) */
#define SSE_TEST_IRQ 10

/* 16-byte GCC vector types: 16 x uint8_t, 4 x uint32_t and 4 x float */
typedef uint8_t __v8 __attribute__((vector_size(16)));
typedef uint32_t __v32 __attribute__((vector_size(16)));
typedef float __vf __attribute__((vector_size(16)));

#ifndef __rtems__
/* Clone of what is defined in rtems/score/cpu.h (for testing under linux) */
/* The fields add up to 512 bytes and the type is 16-byte aligned; it is
 * used below as the memory operand of 'fxsave'.
 */
typedef struct Context_Control_sse {
    uint16_t fcw;           /* compared/printed as FPU control word     */
    uint16_t fsw;           /* compared/printed as FPU status word      */
    uint8_t ftw;            /* tag byte, one bit per FP register        */
    uint8_t res_1;
    uint16_t fop;
    uint32_t fpu_ip;
    uint16_t cs;
    uint16_t res_2;
    uint32_t fpu_dp;
    uint16_t ds;
    uint16_t res_3;
    uint32_t mxcsr;
    uint32_t mxcsr_mask;
    struct {
        uint8_t fpreg[10];  /* 80-bit register image                    */
        uint8_t res_4[ 6];  /* pads each slot to 16 bytes               */
    } fp_mmregs[8];
    uint8_t xmmregs[8][16]; /* eight 16-byte XMM register images        */
    uint8_t res_5[224];
} Context_Control_sse __attribute__((aligned(16))) ;
#endif

/* Bit definitions for the SSE control/status register (MXCSR) */
#define MXCSR_FZ (1<<15) /* Flush to zero */
#define MXCSR_RC(x) (((x)&3)<<13) /* Rounding ctrl */
#define MXCSR_PM (1<<12) /* Precision msk */
#define MXCSR_UM (1<<11) /* Underflow msk */
#define MXCSR_OM (1<<10) /* Overflow msk */
#define MXCSR_ZM (1<< 9) /* Divbyzero msk */
#define MXCSR_DM (1<< 8) /* Denormal msk */
#define MXCSR_IM (1<< 7) /* Invalidop msk */
#define MXCSR_DAZ (1<< 6) /* Denorml are 0 */
#define MXCSR_PE (1<< 5) /* Precision flg */
#define MXCSR_UE (1<< 4) /* Underflow flg */
#define MXCSR_OE (1<< 3) /* Overflow flg */
#define MXCSR_ZE (1<< 2) /* Divbyzero flg */
#define MXCSR_DE (1<< 1) /* Denormal flg */
#define MXCSR_IE (1<< 0) /* Invalidop flg */

/* All MXCSR exception mask bits / all MXCSR exception flag bits */
#define MXCSR_ALLM (MXCSR_PM | MXCSR_UM | MXCSR_OM | MXCSR_ZM | MXCSR_DM | MXCSR_IM)
#define MXCSR_ALLE (MXCSR_PE | MXCSR_UE | MXCSR_OE | MXCSR_ZE | MXCSR_DE | MXCSR_IE)

/* Bit definitions for the FPU status word */
#define FPSR_B (1<<15) /* FPU busy */
#define FPSR_C3 (1<<14) /* Cond code C3 */
#define FPSR_TOP(x) (((x)&7)<<11) /* TOP */
#define FPSR_C2 (1<<10) /* Cond code C2 */
#define FPSR_C1 (1<< 9) /* Cond code C1 */
#define FPSR_C0 (1<< 8) /* Cond code C0 */
#define FPSR_ES (1<< 7) /* Error summary */
#define FPSR_SF (1<< 6) /* Stack fault */
#define FPSR_PE (1<< 5) /* Precision flg */
#define FPSR_UE (1<< 4) /* Underflow flg */
#define FPSR_OE (1<< 3) /* Overflow flg */
#define FPSR_ZE (1<< 2) /* Divbyzero flg */
#define FPSR_DE (1<< 1) /* Denormal flg */
#define FPSR_IE (1<< 0) /* Invalidop flg */

/* Bit definitions for the FPU control word */
#define FPCW_X (1<<12) /* Infinity ctrl */
#define FPCW_RC(x) (((x)&3)<<10) /* Rounding ctrl */
#define FPCW_PC(x) (((x)&3)<< 8) /* Precision ctl */
#define FPCW_PM (1<< 5) /* Precision msk */
#define FPCW_UM (1<< 4) /* Underflow msk */
#define FPCW_OM (1<< 3) /* Overflow msk */
#define FPCW_ZM (1<< 2) /* Divbyzero msk */
#define FPCW_DM (1<< 1) /* Denormal msk */
#define FPCW_IM (1<< 0) /* Invalidop msk */

/* All FPU exception mask bits / all FPU status flag bits (incl. ES, SF) */
#define FPCW_ALLM (FPCW_PM | FPCW_UM | FPCW_OM | FPCW_ZM | FPCW_DM | FPCW_IM)
#define FPSR_ALLE (FPSR_ES | FPSR_SF | FPSR_PE | FPSR_UE | FPSR_OE | FPSR_ZE | FPSR_DE | FPSR_IE)

/* Store 'double' into 80-bit register image */
/* 'v' is passed in st(0) ("t" constraint) and stored/popped by 'fstpt'
 * into the 10-byte array *p_dst; the popped register is listed as clobber.
 */
void fp_st1(uint8_t (*p_dst)[10], double v)
{
    asm volatile("fstpt %0":"=m"(*p_dst):"t"(v):"st");
}

/* Store 'double' into 80-bit register image #i in context */
/* 'i' indexes p_ctxt->fp_mmregs[] (8 entries); it is not range-checked. */
void fp_st(Context_Control_sse *p_ctxt, int i, double v)
{
    fp_st1(&p_ctxt->fp_mmregs[i].fpreg,v);
}

/* Load 'double' from 80-bit register image */
/* 'fldt' loads from operand %1; the result is returned via st(0) ("=t").
 * The second "m" operand names the whole 10-byte array -- presumably so
 * the compiler knows that all of it is read.
 */
double fp_ld1(uint8_t (*p_src)[10])
{
    double v;
    asm volatile("fldt %1":"=t"(v):"m"((*p_src)[0]),"m"(*p_src));
    return v;
}

/* Load 'double' from 80-bit register image #i in context */
/* 'i' indexes p_ctxt->fp_mmregs[] (8 entries); it is not range-checked. */
double fp_ld(Context_Control_sse *p_ctxt, int i)
{
    return fp_ld1(&p_ctxt->fp_mmregs[i].fpreg);
}

/* Clobber list naming all eight FP stack registers, the FP status
 * register and memory.
 */
#define FPUCLOBBER \
    "st","st(1)","st(2)","st(3)", \
    "st(4)","st(5)","st(6)","st(7)",\
    "fpsr","memory"

/* There seems to be no way to say that mxcsr was clobbered */
#define SSECLOBBER \
    "xmm0","xmm1","xmm2","xmm3", \
    "xmm4","xmm5","xmm6","xmm7"

/* Overwrite xmm0..xmm7 with four copies of the 32-bit pattern 'x'.
 * The pattern is built in a 16-byte vector in memory, loaded into xmm0
 * and then copied into the remaining seven registers.
 */
static void
sse_clobber(uint32_t x)
{
__v32 v = { x, x, x, x };
    asm volatile (
        " movdqa %0, %%xmm0 \n"
        " movdqa %%xmm0, %%xmm1 \n"
        " movdqa %%xmm0, %%xmm2 \n"
        " movdqa %%xmm0, %%xmm3 \n"
        " movdqa %%xmm0, %%xmm4 \n"
        " movdqa %%xmm0, %%xmm5 \n"
        " movdqa %%xmm0, %%xmm6 \n"
        " movdqa %%xmm0, %%xmm7 \n"
        :
        :"m"(v)
        :SSECLOBBER
    );
}

/* Re-initialize the FPU ('finit') and fill xmm0..xmm7 with one pattern;
 * implemented as a top-level assembly routine.
 *
 * NOTE(review): the pattern is read with 'movq 0(%esp)'. On entry to a
 * cdecl i386 function 0(%esp) normally holds the return address and the
 * first argument lives at 4(%esp), so the pattern looks like
 * {return address, v1} rather than {v1, v2} -- confirm whether 4(%esp)
 * was intended (any value clobbers the registers, so the test still works).
 */
void
all_clobber(uint32_t v1, uint32_t v2);

__asm__ (
"all_clobber: \n"
" finit \n"
" movq 0(%esp), %xmm0 \n"
" punpcklqdq %xmm0, %xmm0 \n"
" movdqa %xmm0, %xmm1 \n"
" movdqa %xmm0, %xmm2 \n"
" movdqa %xmm0, %xmm3 \n"
" movdqa %xmm0, %xmm4 \n"
" movdqa %xmm0, %xmm5 \n"
" movdqa %xmm0, %xmm6 \n"
" movdqa %xmm0, %xmm7 \n"
" ret \n"
);

/* Clear FPU and save FPU/SSE registers to context area */
/* The pointer argument is fetched from 4(%esp) and used as the 'fxsave'
 * destination; the routine is written in assembly at file scope.
 */
void
init_ctxt(Context_Control_sse *p_ctxt);

__asm__ (
"init_ctxt: \n"
" finit \n"
" mov 4(%esp), %eax\n"
" fxsave (%eax) \n"
" fwait \n"
" ret \n"
);

/* Save FPU/SSE registers to context area */
/* The area is zeroed first, so bytes that 'fxsave' does not write read
 * back as zero. Unlike init_ctxt() the FPU is not re-initialized here
 * (the 'finit' is commented out).
 */
static void
stor_ctxt(Context_Control_sse *p_ctxt)
{
    memset(p_ctxt, 0, sizeof(*p_ctxt));
    asm volatile(
/* " finit \n" */
        " fxsave %0 \n"
        " fwait \n"
        : "=m"(*p_ctxt)
        :
        : FPUCLOBBER
    );
}

/* printf format fragments for 8-, 16- and 32-bit hex fields */
#define H08 "0x%02"PRIx8
#define H16 "0x%04"PRIx16
#define H32 "0x%08"PRIx32

#define F16 "mismatch ("H16" != "H16")\n"

/* Compare field 'fld' of contexts 'a' and 'b'; on a difference set
 * 'rval' and, unless 'quiet', print both values using format 'fmt'.
 * Relies on 'a', 'b', 'rval' and 'quiet' being in scope at the point of
 * use. Expands to a bare 'if' statement (not wrapped in do/while).
 */
#define FLDCMP(fld, fmt) \
    if ( a->fld != b->fld ) { \
        rval = 1; \
        if ( !quiet ) \
            fprintf(stderr,#fld" mismatch ("fmt" != "fmt")\n",a->fld, b->fld); \
    }

/* Compare FP register image #i of contexts 'a' and 'b'.
 *
 * NOTE(review): the text of this macro is damaged in this copy of the
 * file -- the shift expressions after each '(1<' are cut off, and the
 * tail of FLTCMP runs straight into what looks like the body of the
 * XMM register comparison (XMMCMP is used by cmp_ctxt() but no
 * '#define XMMCMP' is visible). Restore both macros from the upstream
 * source; the tokens below are kept exactly as found.
 */
#define FLTCMP(i) \
    do { \
        if ( ( (a->ftw ^ b->ftw) & (1<ftw & b->ftw & (1<fp_mmregs[i].fpreg, \
                           b->fp_mmregs[i].fpreg, \
                           sizeof(a->fp_mmregs[i].fpreg)) \
             ) \
           ) { \
            rval = 1; \
            if ( !quiet ) { \
                double fa = fp_ld(a, i); \
                double fb = fp_ld(b, i); \
                if ( ((a->ftw ^ b->ftw) & (1<ftw & (1<ftw & (1<xmmregs[i], \
                    &b->xmmregs[i], \
                    sizeof(a->xmmregs[i])) \
           ) { \
            rval = 1; \
            if ( !quiet ) { \
                int _jj; \
                fprintf(stderr,"xmmreg[%u] mismatch:\n", i); \
                fprintf(stderr," "); \
                for (_jj=0; _jj<16; _jj++) \
                    fprintf(stderr,"%02x ",a->xmmregs[i][_jj]); \
                fprintf(stderr,"\n !=\n"); \
                fprintf(stderr," "); \
                for (_jj=0; _jj<16; _jj++) \
                    fprintf(stderr,"%02x ",b->xmmregs[i][_jj]); \
                fprintf(stderr,"\n"); \
            } \
        } \
    } while (0)

/* Compare two
FPU/SSE context areas and flag differences;
 * RETURNS: zero if the contexts match and nonzero otherwise
 *
 * All scalar header fields are compared first, then the eight FP
 * register images and the eight XMM register images. Unless 'quiet' is
 * nonzero each difference is reported on stderr. All differences are
 * visited; the function does not stop at the first one.
 */
static int
cmp_ctxt(Context_Control_sse *a, Context_Control_sse *b, int quiet)
{
int rval = 0;
int i;
    FLDCMP(fcw,H16);
    FLDCMP(fsw,H16);
    FLDCMP(ftw,H08);
    FLDCMP(fop,H16);
    FLDCMP(fpu_ip,H32);
    FLDCMP(cs,H16);
    FLDCMP(fpu_dp,H32);
    FLDCMP(ds,H16);
    FLDCMP(mxcsr,H32);
    FLDCMP(mxcsr_mask,H32);
    for ( i=0; i<8; i++ ) {
        FLTCMP(i);
    }
    for ( i=0; i<8; i++ ) {
        /* NOTE(review): no '#define XMMCMP' is visible in this copy */
        XMMCMP(i);
    }
    return rval;
}

/* Possible arguments to exc_raise() */
/* do_raise (below) dispatches on the sign of the value: zero, positive
 * or negative.
 */
#define FP_EXC 0
#define IRQ_EXC 1
#define SSE_EXC -1

/* Check stack alignment by raising the interrupt from a
 * non-16-byte aligned section of code. The exception/IRQ
 * handler must align the stack and SSE context area
 * properly or it will crash.
 */
/* Two-level macro so that SSE_TEST_IRQ is expanded before being
 * stringified into the 'int $32+<n>' instruction.
 */
#define __INTRAISE(x) " int $32+"#x" \n"
#define INTRAISE(x) __INTRAISE(x)

/* do_raise: selector is expected in %eax.
 *   == 0 : nothing more is executed after the 'fwait'
 *    > 0 : software interrupt 32+SSE_TEST_IRQ
 *    < 0 : 'sqrtps' on xmm0
 * NOTE(review): the caller (exc_raise) is not visible in this copy, so
 * how %eax, xmm0 and the FPU state are prepared cannot be confirmed here.
 */
__asm__ (
"do_raise: \n"
" fwait \n"
" test %eax, %eax \n"
" je 2f \n"
" jl 1f \n"
INTRAISE(SSE_TEST_IRQ)
" jmp 2f \n"
"1: sqrtps %xmm0, %xmm0 \n"
"2: \n"
" ret \n"
);

/* Failure bits; bit n corresponds to fail_msgs[n] */
#define SSE_TEST_HP_FAILED 1
#define SSE_TEST_FSPR_FAILED 2
#define SSE_TEST_CTXTCMP_FAILED 4

static const char *fail_msgs[] = {
    "Seems that HP task was not executing",
    "FPSR 'Invalid-operation' flag should be clear",
    "Restored context does NOT match the saved one",
};

/* NOTE(review): this copy of the file is damaged from here on. The loop
 * header of prstat() breaks off after 'i' and the text resumes at
 * '>>1);' in the middle of what the error messages identify as
 * sse_test(). The rest of prstat(), the definitions of exc_raise() and
 * sse_test_hp_task(), the file-scope variables sse_test_sync and
 * sse_test_ohdl, the SSE_TEST_* flag macros and the head of sse_test()
 * (including the declarations of sc, pri, st, errs, flags, sse_tests,
 * hp_task, irqd, fp_cw, fp_cw_set, mxcsr and mxcsr_set) are not visible.
 * Restore them from the upstream source; the tokens below are kept
 * exactly as found.
 */
static void prstat(int st, const char *where)
{
int i,msk;
    for ( i=0, msk=1; i>1);

    sse_test_ohdl = 0;

    /* Run the test with non-default rounding control in both FPCW and
     * MXCSR; the values are checked again after each exception below.
     */
    fp_cw_set = _CPU_Null_fp_context.fpucw | FPCW_RC(3) ;
    mxcsr_set = _CPU_Null_fp_context.mxcsr | MXCSR_RC(3) ;
    asm volatile("ldmxcsr %0"::"m"(mxcsr_set));
    asm volatile("fldcw %0"::"m"(fp_cw_set));

    /* Semaphore released by the handler (sse_test_ehdl) */
    sc = rtems_semaphore_create(
            rtems_build_name('s','s','e','S'),
            0,
            RTEMS_SIMPLE_BINARY_SEMAPHORE,
            0,
            &sse_test_sync
         );
    if ( RTEMS_SUCCESSFUL != sc ) {
        rtems_error(sc, "sse_test ERROR: creation of 'sync' semaphore failed");
        errs++;
        goto bail;
    }

    /* Query own priority; the second task is created with 'pri - 2' */
    rtems_task_set_priority( RTEMS_SELF, RTEMS_CURRENT_PRIORITY, &pri );

    sc = rtems_task_create(
            rtems_build_name('s','s','e','H'),
            pri - 2,
            20000,
            RTEMS_DEFAULT_MODES,
            RTEMS_FLOATING_POINT,
            &hp_task
         );
    if ( RTEMS_SUCCESSFUL != sc ) {
        hp_task = 0;
        rtems_error( sc, "sse_test ERROR: creation of high-priority task failed");
        errs++;
        goto bail;
    }

    sc = rtems_task_start( hp_task, sse_test_hp_task, (rtems_task_argument)sse_test_sync );
    if ( RTEMS_SUCCESSFUL != sc ) {
        /* NOTE(review): unlike the other failure paths this one does not
         * increment 'errs', so the summary at 'bail' can still report
         * PASSED -- confirm whether an 'errs++' is missing here.
         */
        rtems_error( sc, "sse_test ERROR: start of high-priority task failed");
        goto bail;
    }

    /* Test if FP/SSE context is saved/restored across an exception */
    /* Hook our handler; the previous one is put back at 'bail' */
    sse_test_ohdl = _currentExcHandler;
    _currentExcHandler = sse_test_ehdl;

    if ( (sse_tests & SSE_TEST_FPU_EXC) ) {
        if ( (st = exc_raise(FP_EXC)) ) {
            prstat(st,"FP_EXC");
            errs++;
        }

        /* Test modified FPCW/MXCSR */
        asm volatile("fstcw %0":"=m"(fp_cw));
        asm volatile("stmxcsr %0":"=m"(mxcsr));
        /* exception flag bits are ignored in the comparison */
        mxcsr &= ~(MXCSR_ALLE);
        if ( fp_cw != fp_cw_set ) {
            fprintf(stderr,"sse_test ERROR: FPCW mismatch (after FP_EXC): expected 0x%04"PRIx16", got 0x%04"PRIx16"\n", fp_cw_set, fp_cw);
            errs++;
        }
        if ( mxcsr != mxcsr_set ) {
            fprintf(stderr,"sse_test ERROR: MXCSR mismatch (after FP_EXC): expected 0x%08"PRIx32", got 0x%08"PRIx32"\n", mxcsr_set, mxcsr);
            errs++;
        }
    }

    if ( (sse_tests & SSE_TEST_SSE_EXC) ) {
        if ( (st = exc_raise(SSE_EXC)) ) {
            prstat(st, "SSE_EXC");
            errs++;
        }

        /* Test modified FPCW/MXCSR */
        asm volatile("fstcw %0":"=m"(fp_cw));
        asm volatile("stmxcsr %0":"=m"(mxcsr));
        /* exception flag bits are ignored in the comparison */
        mxcsr &= ~(MXCSR_ALLE);
        if ( fp_cw != fp_cw_set ) {
            fprintf(stderr,"sse_test ERROR: FPCW mismatch (after SSE_EXC): expected 0x%04"PRIx16", got 0x%04"PRIx16"\n", fp_cw_set, fp_cw);
            errs++;
        }
        if ( mxcsr != mxcsr_set ) {
            fprintf(stderr,"sse_test ERROR: MXCSR mismatch (after SSE_EXC): expected 0x%08"PRIx32", got 0x%08"PRIx32"\n", mxcsr_set, mxcsr);
            errs++;
        }
    }

    if ( (sse_tests & SSE_TEST_IRQ_EXC) ) {
        /* The same handler function is installed as ISR; the cast hides
         * the differing parameter type.
         */
        memset( &irqd, 0, sizeof(irqd) );
        irqd.name = SSE_TEST_IRQ;
        irqd.hdl = (void*)sse_test_ehdl;
        irqd.handle = 0;

        if ( ! BSP_install_rtems_irq_handler( &irqd ) ) {
            fprintf(stderr, "sse_test ERROR: Unable to install ISR\n");
            errs++;
            goto bail;
        }

        /* Test if FP/SSE context is saved/restored across an interrupt */
        if ( (st = exc_raise(IRQ_EXC)) ) {
            prstat(st, "IRQ");
            errs++;
        }

        if ( ! BSP_remove_rtems_irq_handler( &irqd ) ) {
            /* NOTE(review): reported but not counted in 'errs' --
             * confirm whether that is intended.
             */
            fprintf(stderr, "sse_test ERROR: Unable to uninstall ISR\n");
        }

        /* Test modified FPCW/MXCSR */
        asm volatile("fstcw %0":"=m"(fp_cw));
        asm volatile("stmxcsr %0":"=m"(mxcsr));
        /* exception flag bits are ignored in the comparison */
        mxcsr &= ~(MXCSR_ALLE);
        if ( fp_cw != fp_cw_set ) {
            fprintf(stderr,"sse_test ERROR: FPCW mismatch (after IRQ): expected 0x%04"PRIx16", got 0x%04"PRIx16"\n", fp_cw_set, fp_cw);
            errs++;
        }
        if ( mxcsr != mxcsr_set ) {
            fprintf(stderr,"sse_test ERROR: MXCSR mismatch (after IRQ): expected 0x%08"PRIx32", got 0x%08"PRIx32"\n", mxcsr_set, mxcsr);
            errs++;
        }
    }

bail:
    /* Wait for console to calm down... */
    rtems_task_wake_after(5);
    fprintf(stderr,"SSE/FPU Test %s (%u errors)\n", errs ? "FAILED":"PASSED", errs);

    /* Undo whatever was set up: exception hook, semaphore, second task */
    if ( sse_test_ohdl ) {
        _currentExcHandler = sse_test_ohdl;
        sse_test_ohdl = 0;
    }
    if ( sse_test_sync )
        rtems_semaphore_delete( sse_test_sync );
    sse_test_sync = 0;
    if ( hp_task )
        rtems_task_delete( hp_task );

    /* Terminate the calling task unless SSE_TEST_NO_DEL was requested */
    if ( ! (flags & SSE_TEST_NO_DEL) )
        rtems_task_exit();
}

/* Exception handler; also installed as ISR for SSE_TEST_IRQ.
 *
 * p_f: exception frame, or NULL/0 -- the else-branch below treats a
 *      zero argument as "called for the IRQ".
 *
 * For an exception the frame and the attached FP/SSE context are
 * inspected, the exception condition is cleared in the saved context
 * and the saved XMM registers are checked against a byte pattern. In
 * all cases the live FPU/SSE state is then clobbered and the
 * synchronization semaphore is released.
 */
static void
sse_test_ehdl(CPU_Exception_frame *p_f)
{
int i,j,start = 0;
int mismatch;
__vf f4;

    if ( p_f ) {
        printk("Got exception #%u\n", p_f->idtIndex);
        printk("EIP: 0x%08x, ESP: 0x%08x\n", p_f->eip, p_f->esp0);
        printk("TID: 0x%08x\n", _Thread_Executing->Object.id);
        if ( ! p_f->fp_ctxt ) {
            printk("ERROR: NO FP/SSE CONTEXT ATTACHED ??\n");
            /* NOTE(review): if the previous handler returns, the code
             * below dereferences the NULL fp_ctxt -- presumably the
             * previous handler does not return; confirm.
             */
            sse_test_ohdl(p_f);
        }
        if ( 16 == p_f->idtIndex ) {
            /* vector 16: clear the saved FP status word */
            printk("Resetting FP status (0x%04"PRIx16")\n", p_f->fp_ctxt->fsw);
            p_f->fp_ctxt->fsw = 0;
        } else if ( 19 == p_f->idtIndex ) {
            /* vector 19: negate the saved xmm0 and clear the saved MXCSR
             * exception flags. xmm0 is excluded from the pattern check
             * below (start = 1) because it has just been modified.
             * Presumably the sign flip lets the interrupted 'sqrtps'
             * succeed when it is re-executed -- TODO confirm.
             */
            start = 1;
            memcpy(&f4, p_f->fp_ctxt->xmmregs[0], sizeof(f4));
            f4 = -f4;
            memcpy(p_f->fp_ctxt->xmmregs[0], &f4, sizeof(f4));
            p_f->fp_ctxt->mxcsr &= ~MXCSR_ALLE;
        } else {
            printk("(skipping non-FP exception)\n");
            /* NOTE(review): same fall-through concern as above if the
             * previous handler returns.
             */
            sse_test_ohdl(p_f);
        }
        /* Byte j of saved register i is expected to be (i<<4)|j; the
         * code that loads this pattern is not visible in this copy.
         */
        printk("Checking XMM regs -- ");
        for ( mismatch=0, i=start; i<8; i++ ) {
            for ( j=0; j<16; j++ ) {
                if ( p_f->fp_ctxt->xmmregs[i][j] != ((i<<4) | j) )
                    mismatch++;
            }
        }
        if ( mismatch ) {
            /* dump all eight registers, including xmm0 */
            printk("%u mismatches; dump:\n", mismatch);
            for ( i=0; i<8; i++ ) {
                for ( j=0; j<16; j++ ) {
                    printk("0x%02x ", p_f->fp_ctxt->xmmregs[i][j]);
                }
                printk("\n");
            }
        } else {
            printk("OK\n");
        }
    } else {
        printk("IRQ %u\n", SSE_TEST_IRQ);
    }
    /* Destroy the live FPU/SSE state; the interrupted code must not
     * see this if its context is saved/restored properly.
     */
    printk("Clobbering FPU/SSE state\n");
    asm volatile("finit");
    sse_clobber(0xdeadbeef);
    printk("Notifying task\n");
    rtems_semaphore_release( sse_test_sync );
}

#else

/* Code using signals for testing under linux; unfortunately, 32-bit
 * linux seems to pass no SSE context info to the sigaction...
*/ #include #include #define MKCASE(X) case FPE_##X: msg="FPE_"#X; break; #define CLRXMM(i) __asm__ volatile("pxor %%xmm"#i", %%xmm"#i:::"xmm"#i) static void fpe_act(int signum, siginfo_t *p_info, void *arg3) { ucontext_t *p_ctxt = arg3; const char *msg = "FPE_UNKNOWN"; uint16_t *p_fst; if ( SIGFPE != signum ) { fprintf(stderr,"WARNING: fpe_act handles SIGFPE\n"); return; } switch ( p_info->si_code ) { default: fprintf(stderr,"WARNING: fpe_act got unkown code %u\n", p_info->si_code); return; MKCASE(INTDIV); MKCASE(INTOVF); MKCASE(FLTDIV); MKCASE(FLTOVF); MKCASE(FLTUND); MKCASE(FLTRES); MKCASE(FLTINV); MKCASE(FLTSUB); } fprintf(stderr,"Got SIGFPE (%s) @%p\n", msg, p_info->si_addr); #ifdef __linux__ fprintf(stderr,"Resetting FP status 0x%02lx\n", p_ctxt->uc_mcontext.fpregs->sw); p_ctxt->uc_mcontext.fpregs->sw = 0; #ifdef TEST_MISMATCH fp_st1((void*)&p_ctxt->uc_mcontext.fpregs->_st[3],2.345); #endif #endif /* Clear FPU; if context is properly saved/restored around exception * then this shouldn't disturb the register contents of the interrupted * task/process. 
*/ asm volatile("finit"); sse_clobber(0xdeadbeef); } static void test(void) { Context_Control_sse ctxt; stor_ctxt(&ctxt); printf("FPCW: 0x%"PRIx16"\nFPSW: 0x%"PRIx16"\n", ctxt.fcw, ctxt.fsw); printf("FTAG: 0x%"PRIx8"\n",ctxt.ftw); } int main(int argc, char **argv) { struct sigaction a1, a2; uint32_t mxcsr; memset(&a1, 0, sizeof(a1)); a1.sa_sigaction = fpe_act; a1.sa_flags = SA_SIGINFO; if ( sigaction(SIGFPE, &a1, &a2) ) { perror("sigaction"); return 1; } asm volatile("stmxcsr %0":"=m"(mxcsr)); printf("MXCSR: 0x%08"PRIx32"\n", mxcsr); test(); exc_raise(0); return 0; } #endif /* Helpers to access CR4 and MXCSR */ uint32_t mfcr4() { uint32_t rval; asm volatile("mov %%cr4, %0":"=r"(rval)); return rval; } void mtcr4(uint32_t rval) { asm volatile("mov %0, %%cr4"::"r"(rval)); } uint32_t mfmxcsr() { uint32_t rval; asm volatile("stmxcsr %0":"=m"(rval)); return rval; } void mtmxcsr(uint32_t rval) { asm volatile("ldmxcsr %0"::"m"(rval)); } float sseraise() { __vf f4={-2., -2., -2. -2.}; float f; f4 = __builtin_ia32_sqrtps( f4 ); memcpy(&f,&f4,sizeof(f)); return f; }