diff options
Diffstat (limited to 'c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S')
-rw-r--r--  c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S  (783 lines)
1 file changed, 783 insertions, 0 deletions
diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S new file mode 100644 index 0000000000..e96e572db2 --- /dev/null +++ b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S @@ -0,0 +1,783 @@ +#ifdef __ALTIVEC__ + +#include <rtems/powerpc/powerpc.h> + +#ifndef PPC_CACHE_ALIGNMENT +#error "Missing header; PPC_CACHE_ALIGNMENT is not defined" +#endif + +#define ALTIVEC_TESTING + +#if PPC_CACHE_ALIGNMENT != 32 +#error "Altivec support assumes cache-line size is 32 bytes!" +#else +#undef LD_PPC_CACHE_ALIGNMENT +#define LD_PPC_CACHE_ALIGNMENT 5 +#endif + + .set v0, 0 + .set v8, 8 + .set v16, 16 + .set v20, 20 + .set v24, 24 + .set v28, 28 + + .set r0, 0 + .set r3, 3 + .set r4, 4 + .set r5, 5 + .set r6, 6 + .set r7, 7 + + .set r10, 10 + .set r11, 11 + .set r12, 12 + + .set cr5, 5 + + .set VECSIZE, 16 + + .set VRSAVE_INIT_VAL, 0 + .set VSCR_INIT_VAL, 0 + + .set VRSAVE_OFF, 16 + .set VSCR_OFF, 16+12 + + .set ds0, 0 + + /* Block size for dst -- in units of 16-bytes */ + .set BSIZE, 2 /* = 32 bytes */ + .set BCNT, 12/2+1 /* 12 non-volatile registers + area for vscr/vrsave */ + .set BSTRIDE, 32 /* bytes */ + + .data + + .global _CPU_altivec_vrsave_initval +_CPU_altivec_vrsave_initval: + .long 0 + + .global _CPU_altivec_vscr_initval +_CPU_altivec_vscr_initval: + .long 0 + + .text + + .extern _CPU_altivec_psim_cpu + .extern _CPU_altivec_ctxt_off + + .macro CMPOFF _B0 + lis \_B0, _CPU_altivec_ctxt_off@ha + lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0) + .endm + + /* Conditionally load or store a vector _VR to + * EA(_R1|0 + _R2) + * If bit _VR (corresponding to _VR) is set in CRC + * then the load/store is performed but otherwise + * it is skipped. + * If compiled with IGNORE_VRSAVE defined then + * the load/store is done unconditionally. 
+ * + * _OPCODE: intended to be lvx, lvxl, stvx or stvxl + * _VR : target vector register + * _R1 : base register (NOTE: _R1=r0 uses a + * implicit ZERO constant, not the contents + * of r0) for address computation. + * _R2 : 'offset' register for address computation. + * + * MODIFIES: _VR on output if a load operation is performed. + * IMPLICIT USE: CRC (unless compiled with IGNORE_VRSAVE + * defined. + */ + .macro LDST _OPCODE, _VR, _R1, _R2 +#ifndef IGNORE_VRSAVE + bc 4, \_VR, 111f +#endif + \_OPCODE \_VR, \_R1, \_R2 +111: + .endm + + /* + * Load or store four 'adjacent' vector registers. + * + * _OPCODE: intended to be lvx, lvxl, stvx or stvxl + * _VR : target vector register + * _R1 : base register (NOTE: _R1=r0 uses a + * implicit ZERO constant, not the contents + * of r0) for address computation. + * _B0 : base register 0 + * _B1 : base register 1 + * _B2 : base register 2 + * _B3 : base register 3 + * _RO : offset register + * + * memory addresses for _VR, _VR+1, _VR+2, _VR+3 + * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively. + * + * MODIFIES: _VR, _VR+1, _VR+2, _VR+3 if a load + * operation is performed. + * IMPLICIT USE: see LDST + */ + .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO + LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO + .endm + + /* + * Preload/zero two cache lines and save 4 vector registers + * to memory. + * Note that the cache operation targets memory *past* the + * current storage area which should hopefully hit when + * This same code is executed on the next two cache lines... + * + * This code effectively does + * dcbz (_B0 + 64) + * dcbz (_B0 + 64 + 32) + * stvx _VF+0, (_B0+ 0) + * stvx _VF+1, (_B0+16) + * stvx _VF+2, (_B0+32) + * stvx _VF+3, (_B0+48) + * + * _LRU: may be 'l' or empty. 
The former variant should be + * used when it is conceivable that the memory area is + * unlikely to be used in the near future thus making + * it a candidate for early eviction from the caches. + * + * If it is likely that the memory area is reused soon + * (e.g., save/restore across ISR execution) then the + * 'stvx' opcode (w/o 'l' suffix) should be used. + * + * _VR: first of four target vector registers; _VR+0, + * _VR+1, _VR+2, _VR+3 are saved. + * + * _BO: base address of memory area. + * _B1: should contain _B0+16 on entry + * _B2: should contain _B0+32 on entry + * _B3: should contain _B0+48 on entry + * + * _O1: contains the offset where the four vectors are + * stored. + * _VR -> (_B0 + _O1) = (_B0 + _O1 + 0 ) + * _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 ) + * _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 ) + * _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 ) + * _O2: is set to _O1 + 64 by this macro. Hence _O2 is + * used to address the two cache-lines past the + * current memory area. + * + * MODIFIES: _O2; contains _O1 + 64 after execution of this + * code. + * + * NOTES: a different set of four vectors can be addressed + * simply by changing the one offset register _O1. + * + * Saving more than 4 registers can simply be + * achieved by expanding this macro multiple + * times with _O1 and _O2 swapped (new _O1 + * becomes _O2 = old _O1 + 64) thus stepping + * through the memory area. + * + */ + .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT + dcbz \_B0, \_O2 + dcbz \_B2, \_O2 + LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 + .endm + + /* + * Save eight vector registers by expanding S4VEC_P twice. + * See notes for S4VEC_P above. 
+ * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * + * MODIFIES: After execution, + * _O2 contains original _O1 + 64, + * _O1 contains original _O1 + 128 + * + * NOTES: Expanding this macro multiple times lets you save + * multiple blocks of 8 registers (no reload of _Bx / _Ox is needed). + */ + .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + /* Note that the roles of _O1 and _O2 are swapped here */ + S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1 + .endm + + /* + * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1) + * + * See notes above (for S4VEC_P). + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 256 + * _O2 contains original _O1 + 256 - 64 + */ + .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 + .endm + + /* + * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1) + * + * See notes above (for S4VEC_P, S_V0TOV19). 
+ * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 128 + * _O2 contains original _O1 + 128 - 64 + */ + .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1 + .endm + + /* + * Save all registers to memory area + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 512 + * _O2 contains original _O1 + 512 - 64 + */ + .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P l v0 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S8VEC_P l v8 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2 + .endm + + + /* + * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively. + * We can pass either of them as arguments to another macro which + * allows us to decide if the main macro uses dcbt or not when + * we expand it... + */ + .macro DO_DCBT _RA, _RB + dcbt \_RA, \_RB + .endm + + .macro NO_DCBT _RA, _RB + .endm + + /* + * NOTE REGARDING dcbt VS dst + * + * Preloading the cache with memory areas that we soon need + * can be done either using 'dcbt' or 'dst' instructions + * "ahead of time". + * When experimenting (on a mpc7457) I found that the 'dst' + * stream instruction was very efficient if there is enough + * time to read ahead. It works well when we do a context + * switch: + * + * 1) start DST on new context to be loaded + * 2) save old context to memory + * 3) load new context from memory + * + * Because of the interleaved step 2) dst works nicely and + * 3) finds what it needs in the cache. 
+ * + * However, in a situation when there is not much time + * to start the DST, e.g., because we want to restore + * a context out of the blue (e.g., after returning + * from and ISR): + * + * 1) save volatile registers to memory/stack + * 2) execute ISR + * 3) might do a task context switch + * 4) when returned to old task context then + * reload volatile registers from memory/stack. + * + * In this situation, preloading the target memory before + * or after step 1) makes obviously no sense because after + * 1) the registers area is most likely in the cache already. + * + * Starting preload after 2) doesn't make much sense either. + * If ISR doesn't lead to a context switch then it is quite + * likely that the register area is still in the cache. + * OTOTH, if a context switch happens then the preload after 2) + * might be useless. + * + * This leaves us at step 4) where we want to load immediately. + * In this case, I found that 'dcbt' works more efficiently + * so that's what we use when restoring volatile registers. + * + * When restoring the non-volatile VRs during a 'normal' + * context switch then we shall use DST (and no dcbt). + */ + + /* + * Symmetric to S4VEC_P above but addresses loading four + * vector registers from memory. + * + * Touches two cache lines past the current memory area + * and loads four vectors from the current area. + * + * Optionally, the DCBT operation may be omitted + * (when expanding with _DCBT=NO_DCBT). + * This is useful if the cache was already preloaded + * by another means (dst instruction). + * + * NOTE: We always use the 'LRU' form of lvx: lvxl, + * because we deem it unlikely that the context + * that was just loaded has to be saved again + * to memory in the immediate future. + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O2 contains original _O1 + 64. + * _VR.._VR+3 loaded from memory. 
+ */ + .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT + /* preload/touch 2 lines at offset 64 from _B0 */ + \_DCBT \_B0, \_O2 + \_DCBT \_B2, \_O2 + /* load four vectors at off set 0 from _B0 */ + LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Symmetric to S8VEC_P; loads 8 vector registers + * from memory -- see comments above... + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 128. + * _O2 contains original _O1 + 64. + * _VR.._VR+7 loaded from memory. + */ + .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1 + .endm + + /* + * Load volatile vector registers v0..v19 employing + * the DCBT to preload the cache. The rationale for + * using DCBT here but not when restoring non-volatile + * registers is explained above, see + * + * "NOTE REGARDING dcbt VS dst" + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 256. + * _O2 contains original _O1 + 256 - 64. + * VR0..VR19 loaded from memory. + */ + .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Load non-volatile vector registers v20..v31. + * Note that no DCBT is performed since we use + * DST for preloading the cache during a context + * switch, see + * + * "NOTE REGARDING dcbt VS dst" + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 128. + * _O2 contains original _O1 + 128 - 64. + * VR20..VR31 loaded from memory. 
+ */ + .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Load all registers from memory area. + */ + .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2 + .endm + + /* + * Compute + * _B1 = _B0 + 16 + * _B2 = _B0 + 32 + * _B3 = _B0 + 48 + * and load + * _RO = 0 + * + * convenience macro to be expanded before + * any of the load/store macros that use + * four base addresses etc. + * + * INPUT: _B0 = cache-aligned start of memory area + * + * MODIFIES: _B1, _B2, _B3, _RO as described above. + */ + .macro CMP_BASES _B0, _B1, _B2, _B3, _RO + addi \_B1, \_B0, 1*VECSIZE + addi \_B2, \_B0, 2*VECSIZE + addi \_B3, \_B0, 3*VECSIZE + li \_RO, 0 + .endm + + /* + * Prepare for saving general vector registers. + * + * If not built with #define IGNORE_VRSAVE then + * + * 1) copy vrsave to CRC + * + * endif + * + * 2) copy vrsave to _VRSAVE_REG + * 3) preload/zero cache line where vrsave and vscr are stored. + * 4) compute base adresses from _B0 + * 5) preload/zero first two cache lines (remember that the + * first S8VEC_P starts preloading/zeroing at offset 64). 
+ * + * INPUT: 'vrsave' register, _B0 (base address of memory area) + * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave') + * _B0 = original _BO + 32 + * _B1 = original _B0 + 32 + 16, + * _B2 = original _B0 + 32 + 32, + * _B3 = original _B0 + 32 + 48, + * CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined) + */ + .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO + mfvrsave \_VRSAVE_REG +#ifndef IGNORE_VRSAVE + mtcr \_VRSAVE_REG +#endif + dcbz 0, \_B0 + addi \_B0, \_B0, PPC_CACHE_ALIGNMENT + dcbz 0, \_B0 + CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO + dcbz 0, \_B2 + .endm + + /* + * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers + * must have been loaded from 'vrsave' and 'vscr', respectively, + * prior to expanding this macro. + * + * INPUTS: _VRSAVE_REG GPR holding 'vrsave' contents + * _VSCR_VREG VR holding 'vscr' contents + * _B0 cache-aligned (base) address of memory area. + * MODIFIES: _SCRATCH_REG + */ + .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG + stw \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) + li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF + stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG + .endm + + /* + * Load 'vrsave' and 'vscr' from memory. + * + * INPUTS: _B0 cache-aligned (base) address of memory area. + * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr) + * 'vscr', 'vrsave'. + * CRC (holds contents of 'vrsave') (ONLY IF COMPILED + * with IGNORE_VRSAVE undefined). 
+ */ + .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG + lwz \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) + mtvrsave \_SCRATCH_REG +#ifndef IGNORE_VRSAVE + mtcr \_SCRATCH_REG +#endif + li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF + lvewx \_SCRATCH_VREG, \_B0, \_SCRATCH_REG + mtvscr \_SCRATCH_VREG + .endm + + /* + * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1) + * + * INPUT: _B0 + * MODIFIES: _B0 (as stated above) + */ + .macro CACHE_DOWNALGN _B0 + rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT + .endm + + .text + + .global _CPU_save_altivec_volatile +_CPU_save_altivec_volatile: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r12 +#endif + + PREP_FOR_SAVE r0, r3, r4, r5, r6, r10 + /* r0 now contains VRSAVE, r3 still the aligned memory area + * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3, + * respectively. r10 holds zero + */ + S_V0TOV19 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11 + mfvscr v0 + /* Store vrsave (still in r0) and vscr (in v0) to memory area */ + S_VSCR_VRSAVE r0, v0, r3, r11 + +#ifndef IGNORE_VRSAVE + /* Restore CRC */ + mtcr r12 +#endif + blr + + .global _CPU_load_altivec_volatile +_CPU_load_altivec_volatile: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. 
+ */ + mfcr r12 +#endif + + /* Try to preload 1st line (where vscr and vrsave are stored) */ + dcbt 0, r3 + /* Point to start of general vector-register area */ + addi r3, r3, PPC_CACHE_ALIGNMENT + /* Start preloading 2nd line (where first two vectors are) */ + dcbt 0, r3 + L_VSCR_VRSAVE r3, r0, v0 + CMP_BASES r3, r4, r5, r6, r10 + /* Start preloading 3rd line (where vectors 3 and 4 are) */ + dcbt 0, r5 + L_V0TOV19 r3, r4, r5, r6, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r12 +#endif + blr + + .global _CPU_Context_restore_altivec +_CPU_Context_restore_altivec: + /* Restore is like 'switch' but we don't have + * to save an old context. + * Move argument to second arg and load NULL pointer + * to first one, then jump to 'switch' routine. + */ + mr r4, r3 + li r3, 0 + b _CPU_Context_switch_altivec + + .global _CPU_Context_switch_altivec +_CPU_Context_switch_altivec: + + /* fetch offset of altivec area in context */ + CMPOFF r5 + /* down-align 'to' area to cache-line boundary */ + add r4, r4, r5 + CACHE_DOWNALGN r4 + + /* Check for PSIM */ + lis r6, _CPU_altivec_psim_cpu@ha + lwz r6, _CPU_altivec_psim_cpu@l(r6) + cmpli 0, r6, 0 + bne 1f + /* Skip data-stream instructions on PSIM (not implemented) */ + dssall + /* Pre-load new context into cache */ + lis r6, (BSIZE<<(24-16)) | (BCNT<<(16-16)) + ori r6, r6, BSTRIDE + dstt r4, r6, ds0 +1: + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r12 +#endif + + /* Is 'from' context == NULL ? 
(then we just do a 'restore') */ + cmpli 0, r3, 0 + beq 1f /* yes: skip saving 'from' context */ + + /* SAVE NON-VOLATILE REGISTERS */ + + /* Compute aligned destination pointer (r5 still holds offset + * to 'altivec' area in context) + */ + add r3, r3, r5 + CACHE_DOWNALGN r3 + + PREP_FOR_SAVE r0, r3, r5, r6, r7, r10 + /* The manual says reading vscr can take some time - do + * read it here (into a volatile vector register) while + * we wait for cache blocks to be allocated + */ + mfvscr v0 + S_V20TOV31 _LRU=l, _B0=r3, _B1=r5, _B2=r6, _B3=r7, _O1=r10, _O2=r11 + /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */ + S_VSCR_VRSAVE r0, v0, r3, r5 + +1: + + /* LOAD NON-VOLATILE REGISTERS */ + + /* Advance past vrsave/vscr area */ + addi r4, r4, PPC_CACHE_ALIGNMENT + L_VSCR_VRSAVE r4, r0, v0 + CMP_BASES r4, r5, r6, r7, r10 + L_V20TOV31 r4, r5, r6, r7, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r12 +#endif + blr + + .global _CPU_Context_initialize_altivec +_CPU_Context_initialize_altivec: + CMPOFF r5 + add r3, r3, r5 + CACHE_DOWNALGN r3 + lis r5, _CPU_altivec_vrsave_initval@ha + lwz r5, _CPU_altivec_vrsave_initval@l(r5) + stw r5, VRSAVE_OFF(r3) + lis r6, _CPU_altivec_vscr_initval@ha + lwz r6, _CPU_altivec_vscr_initval@l(r6) + stw r6, VSCR_OFF(r3) + blr + + /* + * Change the initial value of VRSAVE. + * Can be used by initialization code if + * it is determined that code was compiled + * with -mvrsave=no. In this case, VRSAVE + * must be set to all-ones which causes this + * support code to save/restore *all* registers + * (only has an effect if IGNORE_VRSAVE is + * not defined -- otherwise all registers are + * saved/restored anyways). 
+ */ + .global _CPU_altivec_set_vrsave_initval +_CPU_altivec_set_vrsave_initval: + lis r5, _CPU_altivec_vrsave_initval@ha + stw r3, _CPU_altivec_vrsave_initval@l(r5) + mtvrsave r3 + blr + +#ifdef ALTIVEC_TESTING + .global msr_VE_on +msr_VE_on: + mfmsr r3 + oris r3, r3, 1<<(31-6-16) + mtmsr r3 + blr + + .global msr_VE_off +msr_VE_off: + mfmsr r3 + lis r4, 1<<(31-6-16) + andc r3, r3, r4 + mtmsr r3 + blr + + + .global mfvrsave +mfvrsave: + mfvrsave r3 + blr + + .global mtvrsave +mtvrsave: + mtvrsave r3 + blr + + /* Load all vector registers from memory area. + * NOTE: This routine is not strictly ABI compliant -- + * it guarantees that volatile vector registers + * have certain values on exit! + */ + .global _CPU_altivec_load_all +_CPU_altivec_load_all: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r12 +#endif + + /* Try to preload 1st line (where vscr and vrsave are stored) */ + dcbt 0, r3 + /* Point to start of general vector-register area */ + addi r3, r3, PPC_CACHE_ALIGNMENT + /* Start preloading 2nd line (where first two vectors are) */ + dcbt 0, r3 + L_VSCR_VRSAVE r3, r0, v0 + CMP_BASES r3, r4, r5, r6, r10 + /* Start preloading 3rd line (where vectors 3 and 4 are) */ + dcbt 0, r5 + L_V0TOV31 r3, r4, r5, r6, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r12 +#endif + blr + + .global _CPU_altivec_save_all +_CPU_altivec_save_all: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. 
+ */ + mfcr r12 +#endif + + PREP_FOR_SAVE r0, r3, r4, r5, r6, r10 + /* r0 now contains VRSAVE, r3 still the aligned memory area + * and r4, r5, r6 are offset by 16, 32, and 48 bytes from r3, + * respectively. r10 holds zero + */ + S_V0TOV31 _B0=r3, _B1=r4, _B2=r5, _B3=r6, _O1=r10, _O2=r11 + mfvscr v0 + /* Store vrsave (still in r0) and vscr (in v0) to memory area */ + S_VSCR_VRSAVE r0, v0, r3, r11 + +#ifndef IGNORE_VRSAVE + /* Restore CRC */ + mtcr r12 +#endif + blr + + +#if 0 + .gnu_attribute 4,1 + .gnu_attribute 8,1 +#endif + +#endif +#endif |