From fbee4ffddeef1aec126795ca602491a5549c190a Mon Sep 17 00:00:00 2001
From: Till Straumann
Date: Wed, 2 Dec 2009 01:33:51 +0000
Subject: 2009-12-01	Till Straumann

	* Makefile.am, mpc6xx/altivec: new directory implementing
	support for AltiVec context saving/restoring.

---
 c/src/lib/libcpu/powerpc/ChangeLog                |   5 +
 c/src/lib/libcpu/powerpc/Makefile.am              |   7 +
 c/src/lib/libcpu/powerpc/mpc6xx/altivec/README    | 184 +++++
 c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c | 228 ++++++
 .../libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S   | 783 +++++++++++++++++++++
 5 files changed, 1207 insertions(+)
 create mode 100644 c/src/lib/libcpu/powerpc/mpc6xx/altivec/README
 create mode 100644 c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c
 create mode 100644 c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S

diff --git a/c/src/lib/libcpu/powerpc/ChangeLog b/c/src/lib/libcpu/powerpc/ChangeLog
index 2dbad33af0..6807e6b104 100644
--- a/c/src/lib/libcpu/powerpc/ChangeLog
+++ b/c/src/lib/libcpu/powerpc/ChangeLog
@@ -1,3 +1,8 @@
+2009-12-01	Till Straumann
+
+	* Makefile.am, mpc6xx/altivec: new directory implementing
+	support for AltiVec context saving/restoring.
+
 2009-12-01	Till Straumann
 
 	* mpc6xx/mmu/bat.c, mpc6xx/mmu/pte121.c: skip data-
diff --git a/c/src/lib/libcpu/powerpc/Makefile.am b/c/src/lib/libcpu/powerpc/Makefile.am
index 5de0014166..e898bc7754 100644
--- a/c/src/lib/libcpu/powerpc/Makefile.am
+++ b/c/src/lib/libcpu/powerpc/Makefile.am
@@ -241,7 +241,14 @@ noinst_PROGRAMS += mpc6xx/timer.rel
 mpc6xx_timer_rel_SOURCES = mpc6xx/timer/timer.c
 mpc6xx_timer_rel_CPPFLAGS = $(AM_CPPFLAGS)
 mpc6xx_timer_rel_LDFLAGS = $(RTEMS_RELLDFLAGS)
+
+# mpc6xx/altivec
+noinst_PROGRAMS += mpc6xx/altivec.rel
+mpc6xx_altivec_rel_SOURCES = mpc6xx/altivec/vec_sup.c mpc6xx/altivec/vec_sup_asm.S
+mpc6xx_altivec_rel_CPPFLAGS = $(AM_CPPFLAGS)
+mpc6xx_altivec_rel_LDFLAGS = $(RTEMS_RELLDFLAGS)
 endif
+EXTRA_DIST += mpc6xx/altivec/README
 
 if e500
 # mpc6xx/clock
diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/README b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/README
new file mode 100644
index 0000000000..61ebb8dded
--- /dev/null
+++ b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/README
@@ -0,0 +1,184 @@
+RTEMS ALTIVEC SUPPORT
+=====================
+
+1. History
+----------
+
+AltiVec support was developed and maintained as a user extension
+outside of RTEMS. This extension is still available (unbundled)
+from Till Straumann; it is useful
+if an application desires 'lazy switching' of the AltiVec context.
+
+2. Modes
+--------
+
+AltiVec support -- the unbundled extension, that is -- can be used
+in two ways:
+
+a. All tasks are implicitly AltiVec-enabled.
+
+b. Only designated tasks are AltiVec-enabled. 'Lazy context switching'
+   is implemented to switch the AltiVec context.
+
+Note that the code implemented in this directory supports mode 'a'
+and mode 'a' ONLY. For mode 'b' you need the unbundled extension
+(which is completely independent of this code).
+
+Mode 'a' (All tasks are AltiVec-enabled)
+- - - - - - - - - - - - - - - - - - - - -
+
+The major disadvantage of this mode is the additional overhead
+involved: tasks that never use the vector unit still save/restore
+the volatile vector registers (20 registers * 16 bytes each) across
+every interrupt and all non-volatile registers (12 registers * 16 bytes
+each) during every context switch.
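+
+The volatile-register entry points provided by this directory,
+_CPU_save_altivec_volatile() and _CPU_load_altivec_volatile(), are meant
+to be called by the interrupt dispatching code (the actual hookup lives
+outside of this directory). The sketch below is an illustration only;
+the C prototypes, the wrapper name and the buffer sizing are assumptions
+and not part of this patch:
+
+    /* Sizing follows vec_sup_asm.S: one 32-byte cache line for
+     * VRSAVE/VSCR plus 20 * 16 bytes for v0..v19; extra slack is
+     * needed because the routines cache-align the address themselves.
+     */
+    extern void _CPU_save_altivec_volatile(void *area);
+    extern void _CPU_load_altivec_volatile(void *area);
+
+    void example_irq_dispatch(void (*isr)(void))
+    {
+      unsigned char area[20 * 16 + 2 * 32];
+
+      _CPU_save_altivec_volatile(area); /* save v0..v19, VSCR, VRSAVE  */
+      isr();                            /* ISR may use the vector unit */
+      _CPU_load_altivec_volatile(area); /* restore volatile context    */
+    }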
+
+However, saving/restoring the volatile registers, for example, is quite
+fast -- on my 1GHz 7457 saving or restoring 20 vector registers
+takes only about 1us or even less (if there are cache hits).
+
+The advantage is complete transparency to the user and full ABI
+compatibility (except for ISRs and exception handlers), see below.
+
+Mode 'b' (Only dedicated tasks are AltiVec-enabled)
+- - - - - - - - - - - - - - - - - - - - - - - - - -
+
+The advantage of this mode of operation is that the vector registers
+are only saved/restored when a different, AltiVec-enabled task becomes
+ready to run. In particular, if there is only a single AltiVec-enabled
+task then the AltiVec context is *never* switched.
+
+Note that this mode of operation is not supported by the code
+in this directory -- you need the unbundled AltiVec extension
+mentioned above.
+
+3. Compiler Options
+-------------------
+
+Three compiler options affect AltiVec: -maltivec, -mabi=altivec and
+-mvrsave=yes/no.
+
+-maltivec: This makes the preprocessor define the symbol __ALTIVEC__ and
+    enables gcc to emit vector instructions. Note that gcc may use the
+    AltiVec engine implicitly, i.e., **without you writing any
+    vectorized code**.
+
+-mabi=altivec: This option has two effects:
+    i) It ensures the 16-byte stack alignment required by AltiVec
+       (even in combination with eabi, which is RTEMS' default).
+    ii) It allows vector arguments to be passed in vector registers.
+
+-mvrsave=yes/no: Instructs gcc to emit code which sets the VRSAVE register
+    indicating which vector registers are 'currently in use'.
+    Because the AltiVec support does not use this information *) the
+    option has no direct effect, but it is desirable to compile with
+    -mvrsave=no so that no unnecessary code is generated.
+
+    *) The file vec_sup_asm.S conditionally disables usage of
+       the VRSAVE information if the preprocessor symbol
+       'IGNORE_VRSAVE' is defined, which is the default.
+
+       If 'IGNORE_VRSAVE' is undefined then the code *does*
+       use the VRSAVE information, but I found that this does
+       not execute noticeably faster.
+
+IMPORTANT NOTES
+===============
+
+AFAIK, RTEMS uses the EABI, which requires a stack alignment of only
+8 bytes; this is NOT enough for AltiVec (which requires 16-byte
+alignment).
+
+There are two ways of obtaining 16-byte alignment:
+
+I)  Compile with -mno-eabi (the ordinary SYSV ABI has 16-byte alignment).
+II) Compile with -mabi=altivec (an extension to the EABI; it maintains
+    16-byte alignment and also allows vector arguments to be passed in
+    vector registers).
+
+Note that it is crucial to compile ***absolutely everything*** with the same
+ABI options (or a linker error may occur). In particular, this includes
+
+ - the newlib (libc) multilib variant
+ - RTEMS proper
+ - application + third-party code
+
+IMO the proper compiler options for Mode 'a' would be
+
+  -maltivec -mabi=altivec -mvrsave=no
+
+Note that the -mcpu=7400 option also enables -maltivec and -mabi=altivec
+but leaves -mvrsave at some 'default' which is probably 'no'.
+Compiling with -mvrsave=yes does not produce incompatible code but
+may have a performance impact (since extra code is produced to maintain
+VRSAVE).
+
+4. Multilib Variants
+--------------------
+
+The default GCC configuration for RTEMS contains a -mcpu=7400 multilib
+variant which is the correct one to choose.
+
+5. BSP 'custom' file.
+---------------------
+
+Now that you have the necessary newlib and libgcc etc. variants,
+you also need to build RTEMS accordingly.
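+
+As a quick compile-time sanity check (an illustration only; the names
+below are made up and this is not part of the build system), application
+code can guard against a mismatch of the options discussed above:
+
+    #ifndef __ALTIVEC__
+    #error "compile with -maltivec (e.g., -mcpu=7400)"
+    #endif
+
+    /* With -mabi=altivec (or -mno-eabi) gcc keeps vector locals
+     * like this one 16-byte aligned on the stack.
+     */
+    typedef int v4si __attribute__((vector_size(16)));
+
+    int altivec_stack_alignment_ok(void)
+    {
+      volatile v4si v = { 0, 0, 0, 0 };
+      return (((unsigned long)&v) & 15) == 0;
+    }
+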
+ +In you BSP's make/custom/.cfg file make sure the CPU_CFLAGS +select the desired variant: + +for mode 'a': + + CPU_CFLAGS = ... -mcpu=7400 + +Note that since -maltivec globally defines __ALTIVEC__ RTEMS automatially +enables code that takes care of switching the AltiVec context as necessary. +This is transparent to application code. + +6. BSP support +-------------- + +It is the BSP's responsibility to initialize MSR_VE, VSCR and VRSAVE +during early boot, ideally before any C-code is executed (because it +may, theoretically, use vector instructions). + +The BSP must + + - set MSR_VE + - clear VRSAVE; note that the probing algorithm for detecting + whether -mvrsave=yes or 'no' was used relies on the BSP + clearing VRSAVE during early start. Since no interrupts or + context switches happen before the AltiVec support is initialized + clearing VRSAVE is no problem even if it turns out that -mvrsave=no + was in effect (eventually a value of all-ones will be stored + in VRSAVE in this case). + - clear VSCR + +7. PSIM note +------------ + +PSIM supports the AltiVec instruction set with the exception of +the 'data stream' instructions for cache prefetching. The RTEMS +altivec support includes run-time checks to skip these instruction +when executing on PSIM. + +Note that AltiVec support within PSIM must be enabled at 'configure' +time by passing the 'configure' option + +--enable-sim-float=altivec + +Note also that PSIM's AltiVec support has many bugs. It is recommended +to apply the patches filed as an attachment with gdb bug report #2461 +prior to building PSIM. + +The CPU type and corresponding multilib must be changed when +building RTEMS/psim: + + edit make/custom/psim.cfg and change + + CPU_CFLAGS = ... -mcpu=603e + + to + + CPU_CFLAGS = ... -mcpu=7400 + +This change must be performed *before* configuring RTEMS/psim. diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c new file mode 100644 index 0000000000..b23edbce8a --- /dev/null +++ b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c @@ -0,0 +1,228 @@ +/* $Id$ */ + +/* Altivec support for RTEMS; vector register context management. + * This is implemented as a user extension. + * + * Author: Till Straumann , 2005 + */ + +#ifdef __ALTIVEC__ + +#include +#include +#include +#include +#include +#include + +#define STATIC static + +#define VEC_ALIGNMENT 16 + +#define NAM "AltiVec Support" +#define ERRID(a,b,c,d) (((a)<<24) | ((b)<<16) | ((c)<<8) | (d)) + +typedef uint32_t _vu32 __attribute__((vector_size(VEC_ALIGNMENT))); + +#ifndef MSR_VE +#define MSR_VE (1<<(31-6)) +#endif + +/* NOTE: These two variables are accessed by assembly code + * which assumes 32-bit data! + */ +uint32_t _CPU_altivec_ctxt_off = 0; +uint32_t _CPU_altivec_psim_cpu = 0; + +static inline uint32_t +mfmsr(void) +{ +uint32_t v; + _CPU_MSR_GET(v); + return v; +} + +static inline void +mtmsr(uint32_t v) +{ + _CPU_MSR_SET(v); +} + +static inline void +isync(void) +{ + asm volatile("isync"); +} + +static inline void +dssall(void) +{ + if ( !_CPU_altivec_psim_cpu) + asm volatile("dssall"); +} + +static inline uint32_t +set_MSR_VE(void) +{ +uint32_t rval; + rval=mfmsr(); + if ( ! (MSR_VE & rval ) ) { + mtmsr(rval | MSR_VE); + isync(); + } + return rval; +} + +static inline void +clr_MSR_VE(void) +{ + dssall(); + mtmsr(mfmsr() & ~MSR_VE); + isync(); +} + +static inline void +rst_MSR_VE(uint32_t old) +{ + if ( ! 
( MSR_VE & old ) ) { + dssall(); + mtmsr(old); + isync(); + } +} + + +/* Code to probe the compiler's stack alignment (PowerPC); + * The routine determines at run-time if the compiler generated + * 8 or 16-byte aligned code. + * + * Till Straumann , 2005 + */ + +static void dummy(void) __attribute__((noinline)); +/* add (empty) asm statement to make sure this isn't optimized away */ +static void dummy(void) { asm volatile(""); } + +static unsigned probe_r1(void) __attribute__((noinline)); +static unsigned probe_r1(void) +{ +unsigned r1; + /* call something to enforce creation of a minimal stack frame; + * (8 bytes: r1 and lr space for 'dummy' callee). If compiled + * with -meabi -mno-altivec gcc allocates 8 bytes, if -mno-eabi + * or -maltivec / -mabi=altivec then gcc allocates 16 bytes + * according to the sysv / altivec ABI specs. + */ + dummy(); + /* return stack pointer */ + asm volatile("mr %0,1":"=r"(r1)); + return r1; +} + +static unsigned +probe_ppc_stack_alignment(void) +{ +unsigned r1; + asm volatile("mr %0,1":"=r"(r1)); + return (r1 - probe_r1()) & ~ 0xf; +} + +STATIC int check_stack_alignment(void) +{ +int rval = 0; + if ( VEC_ALIGNMENT > PPC_STACK_ALIGNMENT ) { + printk(NAM": CPU support has unsufficient stack alignment;\n"); + printk("modify 'cpukit/score/cpu/powerpc/rtems/score/powerpc.h'\n"); + printk("and choose PPC_ABI_SVR4. I'll enable a workaround for now.\n"); + rval |= 1; + } + /* Run-time check; should compile with -mabi=altivec */ + if ( probe_ppc_stack_alignment() < VEC_ALIGNMENT ) { + printk(NAM": run-time stack alignment unsufficient; make sure you compile with -mabi=altivec\n"); + rval |= 2; + } + return rval; +} + + +static uint32_t probe_vrsave(_vu32 *p_v) __attribute__((noinline)); + +/* Check if this code was compiled with -mvrsave=yes or no + * so that we can set the default/init value accordingly. + */ +static uint32_t probe_vrsave(_vu32 *p_v) +{ +_vu32 x; +uint32_t vrsave; + /* Explicitly clobber a volatile vector reg (0) that is + * not used to pass return values. + * If -mvrsave=yes was used this should cause gcc to + * set bit 0 in vrsave. OTOH this bit cannot be set + * because v0 is volatile and not used to pass a value + * to the caller... + */ + asm volatile("vxor %0, 0, 0; mfvrsave %1":"=v"(x),"=r"(vrsave)::"v0"); + if ( p_v ) { + *p_v = x; + } + return vrsave; +} + +static int vrsave_yes(void) __attribute__((noinline)); + +static int vrsave_yes(void) +{ +uint32_t vrsave_pre; + asm volatile("mfvrsave %0":"=r"(vrsave_pre)); + if ( (vrsave_pre & 0x80000000) ) { + printk(NAM": WARNING - unable to determine whether -mvrsave was used; assuming NO\n"); + return 0; + } + return probe_vrsave(0) != vrsave_pre; +} + +extern void +_CPU_altivec_set_vrsave_initval(uint32_t); + + +void +_CPU_Initialize_altivec(void) +{ +unsigned pvr; + + /* I don't like to have to #define the offset of the altivec area + * for use by assembly code. + * Therefore, we compute it here and store it in memory... + */ + _CPU_altivec_ctxt_off = (uint32_t) &((Context_Control*)0)->altivec; + /* + * Add space possibly needed for alignment + */ + _CPU_altivec_ctxt_off += PPC_CACHE_ALIGNMENT - 1; + + if ( ! vrsave_yes() ) { + /* They seemed to compile with -mvrsave=no. Hence we + * must set VRSAVE so that all registers are saved/restored + * in case this support was not built with IGNORE_VRSAVE. 
+ */ + _CPU_altivec_set_vrsave_initval( -1 ); + } + + if ( check_stack_alignment() & 2 ) + rtems_fatal_error_occurred(ERRID('V','E','C','1')); + + pvr = get_ppc_cpu_type(); + /* psim has altivec but lacks the streaming instructions :-( */ + _CPU_altivec_psim_cpu = (PPC_PSIM == pvr); + + if ( ! ppc_cpu_has_altivec() ) { + printk(NAM": This CPU seems not to have AltiVec\n"); + rtems_panic("Unable to initialize AltiVec Support\n"); + } + + if ( ! (mfmsr() & MSR_VE) ) { + printk(NAM": Warning: BSP should set MSR_VE early; doing it now...\n"); + set_MSR_VE(); + } +} +#endif diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S new file mode 100644 index 0000000000..e96e572db2 --- /dev/null +++ b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @@ -0,0 +1,783 @@ +#ifdef __ALTIVEC__ + +#include + +#ifndef PPC_CACHE_ALIGNMENT +#error "Missing header; PPC_CACHE_ALIGNMENT is not defined" +#endif + +#define ALTIVEC_TESTING + +#if PPC_CACHE_ALIGNMENT != 32 +#error "Altivec support assumes cache-line size is 32 bytes!" +#else +#undef LD_PPC_CACHE_ALIGNMENT +#define LD_PPC_CACHE_ALIGNMENT 5 +#endif + + .set v0, 0 + .set v8, 8 + .set v16, 16 + .set v20, 20 + .set v24, 24 + .set v28, 28 + + .set r0, 0 + .set r3, 3 + .set r4, 4 + .set r5, 5 + .set r6, 6 + .set r7, 7 + + .set r10, 10 + .set r11, 11 + .set r12, 12 + + .set cr5, 5 + + .set VECSIZE, 16 + + .set VRSAVE_INIT_VAL, 0 + .set VSCR_INIT_VAL, 0 + + .set VRSAVE_OFF, 16 + .set VSCR_OFF, 16+12 + + .set ds0, 0 + + /* Block size for dst -- in units of 16-bytes */ + .set BSIZE, 2 /* = 32 bytes */ + .set BCNT, 12/2+1 /* 12 non-volatile registers + area for vscr/vrsave */ + .set BSTRIDE, 32 /* bytes */ + + .data + + .global _CPU_altivec_vrsave_initval +_CPU_altivec_vrsave_initval: + .long 0 + + .global _CPU_altivec_vscr_initval +_CPU_altivec_vscr_initval: + .long 0 + + .text + + .extern _CPU_altivec_psim_cpu + .extern _CPU_altivec_ctxt_off + + .macro CMPOFF _B0 + lis \_B0, _CPU_altivec_ctxt_off@ha + lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0) + .endm + + /* Conditionally load or store a vector _VR to + * EA(_R1|0 + _R2) + * If bit _VR (corresponding to _VR) is set in CRC + * then the load/store is performed but otherwise + * it is skipped. + * If compiled with IGNORE_VRSAVE defined then + * the load/store is done unconditionally. + * + * _OPCODE: intended to be lvx, lvxl, stvx or stvxl + * _VR : target vector register + * _R1 : base register (NOTE: _R1=r0 uses a + * implicit ZERO constant, not the contents + * of r0) for address computation. + * _R2 : 'offset' register for address computation. + * + * MODIFIES: _VR on output if a load operation is performed. + * IMPLICIT USE: CRC (unless compiled with IGNORE_VRSAVE + * defined. + */ + .macro LDST _OPCODE, _VR, _R1, _R2 +#ifndef IGNORE_VRSAVE + bc 4, \_VR, 111f +#endif + \_OPCODE \_VR, \_R1, \_R2 +111: + .endm + + /* + * Load or store four 'adjacent' vector registers. + * + * _OPCODE: intended to be lvx, lvxl, stvx or stvxl + * _VR : target vector register + * _R1 : base register (NOTE: _R1=r0 uses a + * implicit ZERO constant, not the contents + * of r0) for address computation. + * _B0 : base register 0 + * _B1 : base register 1 + * _B2 : base register 2 + * _B3 : base register 3 + * _RO : offset register + * + * memory addresses for _VR, _VR+1, _VR+2, _VR+3 + * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively. + * + * MODIFIES: _VR, _VR+1, _VR+2, _VR+3 if a load + * operation is performed. 
+ * IMPLICIT USE: see LDST + */ + .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO + LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO + .endm + + /* + * Preload/zero two cache lines and save 4 vector registers + * to memory. + * Note that the cache operation targets memory *past* the + * current storage area which should hopefully hit when + * This same code is executed on the next two cache lines... + * + * This code effectively does + * dcbz (_B0 + 64) + * dcbz (_B0 + 64 + 32) + * stvx _VF+0, (_B0+ 0) + * stvx _VF+1, (_B0+16) + * stvx _VF+2, (_B0+32) + * stvx _VF+3, (_B0+48) + * + * _LRU: may be 'l' or empty. The former variant should be + * used when it is conceivable that the memory area is + * unlikely to be used in the near future thus making + * it a candidate for early eviction from the caches. + * + * If it is likely that the memory area is reused soon + * (e.g., save/restore across ISR execution) then the + * 'stvx' opcode (w/o 'l' suffix) should be used. + * + * _VR: first of four target vector registers; _VR+0, + * _VR+1, _VR+2, _VR+3 are saved. + * + * _BO: base address of memory area. + * _B1: should contain _B0+16 on entry + * _B2: should contain _B0+32 on entry + * _B3: should contain _B0+48 on entry + * + * _O1: contains the offset where the four vectors are + * stored. + * _VR -> (_B0 + _O1) = (_B0 + _O1 + 0 ) + * _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 ) + * _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 ) + * _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 ) + * _O2: is set to _O1 + 64 by this macro. Hence _O2 is + * used to address the two cache-lines past the + * current memory area. + * + * MODIFIES: _O2; contains _O1 + 64 after execution of this + * code. + * + * NOTES: a different set of four vectors can be addressed + * simply by changing the one offset register _O1. + * + * Saving more than 4 registers can simply be + * achieved by expanding this macro multiple + * times with _O1 and _O2 swapped (new _O1 + * becomes _O2 = old _O1 + 64) thus stepping + * through the memory area. + * + */ + .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT + dcbz \_B0, \_O2 + dcbz \_B2, \_O2 + LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 + .endm + + /* + * Save eight vector registers by expanding S4VEC_P twice. + * See notes for S4VEC_P above. + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * + * MODIFIES: After execution, + * _O2 contains original _O1 + 64, + * _O1 contains original _O1 + 128 + * + * NOTES: Expanding this macro multiple times lets you save + * multiple blocks of 8 registers (no reload of _Bx / _Ox is needed). + */ + .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + /* Note that the roles of _O1 and _O2 are swapped here */ + S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1 + .endm + + /* + * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1) + * + * See notes above (for S4VEC_P). 
+ * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 256 + * _O2 contains original _O1 + 256 - 64 + */ + .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 + .endm + + /* + * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1) + * + * See notes above (for S4VEC_P, S_V0TOV19). + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 128 + * _O2 contains original _O1 + 128 - 64 + */ + .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1 + .endm + + /* + * Save all registers to memory area + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 512 + * _O2 contains original _O1 + 512 - 64 + */ + .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P l v0 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S8VEC_P l v8 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2 + .endm + + + /* + * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively. + * We can pass either of them as arguments to another macro which + * allows us to decide if the main macro uses dcbt or not when + * we expand it... + */ + .macro DO_DCBT _RA, _RB + dcbt \_RA, \_RB + .endm + + .macro NO_DCBT _RA, _RB + .endm + + /* + * NOTE REGARDING dcbt VS dst + * + * Preloading the cache with memory areas that we soon need + * can be done either using 'dcbt' or 'dst' instructions + * "ahead of time". + * When experimenting (on a mpc7457) I found that the 'dst' + * stream instruction was very efficient if there is enough + * time to read ahead. It works well when we do a context + * switch: + * + * 1) start DST on new context to be loaded + * 2) save old context to memory + * 3) load new context from memory + * + * Because of the interleaved step 2) dst works nicely and + * 3) finds what it needs in the cache. + * + * However, in a situation when there is not much time + * to start the DST, e.g., because we want to restore + * a context out of the blue (e.g., after returning + * from and ISR): + * + * 1) save volatile registers to memory/stack + * 2) execute ISR + * 3) might do a task context switch + * 4) when returned to old task context then + * reload volatile registers from memory/stack. + * + * In this situation, preloading the target memory before + * or after step 1) makes obviously no sense because after + * 1) the registers area is most likely in the cache already. + * + * Starting preload after 2) doesn't make much sense either. + * If ISR doesn't lead to a context switch then it is quite + * likely that the register area is still in the cache. + * OTOTH, if a context switch happens then the preload after 2) + * might be useless. + * + * This leaves us at step 4) where we want to load immediately. + * In this case, I found that 'dcbt' works more efficiently + * so that's what we use when restoring volatile registers. + * + * When restoring the non-volatile VRs during a 'normal' + * context switch then we shall use DST (and no dcbt). 
+ */ + + /* + * Symmetric to S4VEC_P above but addresses loading four + * vector registers from memory. + * + * Touches two cache lines past the current memory area + * and loads four vectors from the current area. + * + * Optionally, the DCBT operation may be omitted + * (when expanding with _DCBT=NO_DCBT). + * This is useful if the cache was already preloaded + * by another means (dst instruction). + * + * NOTE: We always use the 'LRU' form of lvx: lvxl, + * because we deem it unlikely that the context + * that was just loaded has to be saved again + * to memory in the immediate future. + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O2 contains original _O1 + 64. + * _VR.._VR+3 loaded from memory. + */ + .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT + /* preload/touch 2 lines at offset 64 from _B0 */ + \_DCBT \_B0, \_O2 + \_DCBT \_B2, \_O2 + /* load four vectors at off set 0 from _B0 */ + LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Symmetric to S8VEC_P; loads 8 vector registers + * from memory -- see comments above... + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 128. + * _O2 contains original _O1 + 64. + * _VR.._VR+7 loaded from memory. + */ + .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1 + .endm + + /* + * Load volatile vector registers v0..v19 employing + * the DCBT to preload the cache. The rationale for + * using DCBT here but not when restoring non-volatile + * registers is explained above, see + * + * "NOTE REGARDING dcbt VS dst" + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 256. + * _O2 contains original _O1 + 256 - 64. + * VR0..VR19 loaded from memory. + */ + .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Load non-volatile vector registers v20..v31. + * Note that no DCBT is performed since we use + * DST for preloading the cache during a context + * switch, see + * + * "NOTE REGARDING dcbt VS dst" + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 128. + * _O2 contains original _O1 + 128 - 64. + * VR20..VR31 loaded from memory. + */ + .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Load all registers from memory area. + */ + .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2 + .endm + + /* + * Compute + * _B1 = _B0 + 16 + * _B2 = _B0 + 32 + * _B3 = _B0 + 48 + * and load + * _RO = 0 + * + * convenience macro to be expanded before + * any of the load/store macros that use + * four base addresses etc. + * + * INPUT: _B0 = cache-aligned start of memory area + * + * MODIFIES: _B1, _B2, _B3, _RO as described above. 
+ */ + .macro CMP_BASES _B0, _B1, _B2, _B3, _RO + addi \_B1, \_B0, 1*VECSIZE + addi \_B2, \_B0, 2*VECSIZE + addi \_B3, \_B0, 3*VECSIZE + li \_RO, 0 + .endm + + /* + * Prepare for saving general vector registers. + * + * If not built with #define IGNORE_VRSAVE then + * + * 1) copy vrsave to CRC + * + * endif + * + * 2) copy vrsave to _VRSAVE_REG + * 3) preload/zero cache line where vrsave and vscr are stored. + * 4) compute base adresses from _B0 + * 5) preload/zero first two cache lines (remember that the + * first S8VEC_P starts preloading/zeroing at offset 64). + * + * INPUT: 'vrsave' register, _B0 (base address of memory area) + * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave') + * _B0 = original _BO + 32 + * _B1 = original _B0 + 32 + 16, + * _B2 = original _B0 + 32 + 32, + * _B3 = original _B0 + 32 + 48, + * CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined) + */ + .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO + mfvrsave \_VRSAVE_REG +#ifndef IGNORE_VRSAVE + mtcr \_VRSAVE_REG +#endif + dcbz 0, \_B0 + addi \_B0, \_B0, PPC_CACHE_ALIGNMENT + dcbz 0, \_B0 + CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO + dcbz 0, \_B2 + .endm + + /* + * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers + * must have been loaded from 'vrsave' and 'vscr', respectively, + * prior to expanding this macro. + * + * INPUTS: _VRSAVE_REG GPR holding 'vrsave' contents + * _VSCR_VREG VR holding 'vscr' contents + * _B0 cache-aligned (base) address of memory area. + * MODIFIES: _SCRATCH_REG + */ + .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG + stw \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) + li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF + stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG + .endm + + /* + * Load 'vrsave' and 'vscr' from memory. + * + * INPUTS: _B0 cache-aligned (base) address of memory area. + * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr) + * 'vscr', 'vrsave'. + * CRC (holds contents of 'vrsave') (ONLY IF COMPILED + * with IGNORE_VRSAVE undefined). + */ + .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG + lwz \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) + mtvrsave \_SCRATCH_REG +#ifndef IGNORE_VRSAVE + mtcr \_SCRATCH_REG +#endif + li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF + lvewx \_SCRATCH_VREG, \_B0, \_SCRATCH_REG + mtvscr \_SCRATCH_VREG + .endm + + /* + * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1) + * + * INPUT: _B0 + * MODIFIES: _B0 (as stated above) + */ + .macro CACHE_DOWNALGN _B0 + rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT + .endm + + .text + + .global _CPU_save_altivec_volatile +_CPU_save_altivec_volatile: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r12 +#endif + + PREP_FOR_SAVE r0, r3, r4, r5, r6, r10 + /* r0 now contains VRSAVE, r3 still the aligned memory area + * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3, + * respectively. 
r10 holds zero + */ + S_V0TOV19 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11 + mfvscr v0 + /* Store vrsave (still in r0) and vscr (in v0) to memory area */ + S_VSCR_VRSAVE r0, v0, r3, r11 + +#ifndef IGNORE_VRSAVE + /* Restore CRC */ + mtcr r12 +#endif + blr + + .global _CPU_load_altivec_volatile +_CPU_load_altivec_volatile: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r12 +#endif + + /* Try to preload 1st line (where vscr and vrsave are stored) */ + dcbt 0, r3 + /* Point to start of general vector-register area */ + addi r3, r3, PPC_CACHE_ALIGNMENT + /* Start preloading 2nd line (where first two vectors are) */ + dcbt 0, r3 + L_VSCR_VRSAVE r3, r0, v0 + CMP_BASES r3, r4, r5, r6, r10 + /* Start preloading 3rd line (where vectors 3 and 4 are) */ + dcbt 0, r5 + L_V0TOV19 r3, r4, r5, r6, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r12 +#endif + blr + + .global _CPU_Context_restore_altivec +_CPU_Context_restore_altivec: + /* Restore is like 'switch' but we don't have + * to save an old context. + * Move argument to second arg and load NULL pointer + * to first one, then jump to 'switch' routine. + */ + mr r4, r3 + li r3, 0 + b _CPU_Context_switch_altivec + + .global _CPU_Context_switch_altivec +_CPU_Context_switch_altivec: + + /* fetch offset of altivec area in context */ + CMPOFF r5 + /* down-align 'to' area to cache-line boundary */ + add r4, r4, r5 + CACHE_DOWNALGN r4 + + /* Check for PSIM */ + lis r6, _CPU_altivec_psim_cpu@ha + lwz r6, _CPU_altivec_psim_cpu@l(r6) + cmpli 0, r6, 0 + bne 1f + /* Skip data-stream instructions on PSIM (not implemented) */ + dssall + /* Pre-load new context into cache */ + lis r6, (BSIZE<<(24-16)) | (BCNT<<(16-16)) + ori r6, r6, BSTRIDE + dstt r4, r6, ds0 +1: + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r12 +#endif + + /* Is 'from' context == NULL ? (then we just do a 'restore') */ + cmpli 0, r3, 0 + beq 1f /* yes: skip saving 'from' context */ + + /* SAVE NON-VOLATILE REGISTERS */ + + /* Compute aligned destination pointer (r5 still holds offset + * to 'altivec' area in context) + */ + add r3, r3, r5 + CACHE_DOWNALGN r3 + + PREP_FOR_SAVE r0, r3, r5, r6, r7, r10 + /* The manual says reading vscr can take some time - do + * read it here (into a volatile vector register) while + * we wait for cache blocks to be allocated + */ + mfvscr v0 + S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11 + /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */ + S_VSCR_VRSAVE r0, v0, r3, r5 + +1: + + /* LOAD NON-VOLATILE REGISTERS */ + + /* Advance past vrsave/vscr area */ + addi r4, r4, PPC_CACHE_ALIGNMENT + L_VSCR_VRSAVE r4, r0, v0 + CMP_BASES r4, r5, r6, r7, r10 + L_V20TOV31 r4, r5, r6, r7, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r12 +#endif + blr + + .global _CPU_Context_initialize_altivec +_CPU_Context_initialize_altivec: + CMPOFF r5 + add r3, r3, r5 + CACHE_DOWNALGN r3 + lis r5, _CPU_altivec_vrsave_initval@ha + lwz r5, _CPU_altivec_vrsave_initval@l(r5) + stw r5, VRSAVE_OFF(r3) + lis r6, _CPU_altivec_vscr_initval@ha + lwz r6, _CPU_altivec_vscr_initval@l(r6) + stw r6, VSCR_OFF(r3) + blr + + /* + * Change the initial value of VRSAVE. 
+ * Can be used by initialization code if + * it is determined that code was compiled + * with -mvrsave=no. In this case, VRSAVE + * must be set to all-ones which causes this + * support code to save/restore *all* registers + * (only has an effect if IGNORE_VRSAVE is + * not defined -- otherwise all registers are + * saved/restored anyways). + */ + .global _CPU_altivec_set_vrsave_initval +_CPU_altivec_set_vrsave_initval: + lis r5, _CPU_altivec_vrsave_initval@ha + stw r3, _CPU_altivec_vrsave_initval@l(r5) + mtvrsave r3 + blr + +#ifdef ALTIVEC_TESTING + .global msr_VE_on +msr_VE_on: + mfmsr r3 + oris r3, r3, 1<<(31-6-16) + mtmsr r3 + blr + + .global msr_VE_off +msr_VE_off: + mfmsr r3 + lis r4, 1<<(31-6-16) + andc r3, r3, r4 + mtmsr r3 + blr + + + .global mfvrsave +mfvrsave: + mfvrsave r3 + blr + + .global mtvrsave +mtvrsave: + mtvrsave r3 + blr + + /* Load all vector registers from memory area. + * NOTE: This routine is not strictly ABI compliant -- + * it guarantees that volatile vector registers + * have certain values on exit! + */ + .global _CPU_altivec_load_all +_CPU_altivec_load_all: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r12 +#endif + + /* Try to preload 1st line (where vscr and vrsave are stored) */ + dcbt 0, r3 + /* Point to start of general vector-register area */ + addi r3, r3, PPC_CACHE_ALIGNMENT + /* Start preloading 2nd line (where first two vectors are) */ + dcbt 0, r3 + L_VSCR_VRSAVE r3, r0, v0 + CMP_BASES r3, r4, r5, r6, r10 + /* Start preloading 3rd line (where vectors 3 and 4 are) */ + dcbt 0, r5 + L_V0TOV31 r3, r4, r5, r6, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r12 +#endif + blr + + .global _CPU_altivec_save_all +_CPU_altivec_save_all: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r12 +#endif + + PREP_FOR_SAVE r0, r3, r4, r5, r6, r10 + /* r0 now contains VRSAVE, r3 still the aligned memory area + * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3, + * respectively. r10 holds zero + */ + S_V0TOV31 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11 + mfvscr v0 + /* Store vrsave (still in r0) and vscr (in v0) to memory area */ + S_VSCR_VRSAVE r0, v0, r3, r11 + +#ifndef IGNORE_VRSAVE + /* Restore CRC */ + mtcr r12 +#endif + blr + + +#if 0 + .gnu_attribute 4,1 + .gnu_attribute 8,1 +#endif + +#endif +#endif -- cgit v1.2.3
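As an illustration of the BSP duties listed in section 6 of the README
(a sketch only; the function name and its place in the BSP start-up
sequence are assumptions and not part of this patch), early boot code
could set up the vector unit roughly as follows:

    /* Set MSR[VE] and clear VRSAVE/VSCR before any vector code runs.
     * MSR_VE matches the definition used in vec_sup.c (bit 6, IBM
     * numbering).
     */
    #define MSR_VE (1 << (31 - 6))

    static void bsp_early_altivec_setup(void)
    {
      unsigned long msr;

      __asm__ volatile ("mfmsr %0" : "=r"(msr));
      msr |= MSR_VE;                               /* enable vector unit */
      __asm__ volatile ("mtmsr %0; isync" : : "r"(msr));

      __asm__ volatile ("mtvrsave %0" : : "r"(0)); /* clear VRSAVE       */

      /* VSCR can only be written via a vector register: zero v0 first */
      __asm__ volatile ("vxor 0, 0, 0; mtvscr 0" : : : "v0");
    }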