From 4fd1ff0f0d8d1e3029f488a011acd83115dccdef Mon Sep 17 00:00:00 2001 From: Sebastian Huber Date: Mon, 26 Mar 2018 06:57:10 +0200 Subject: bsps/powerpc: Move AltiVec support to bsps This patch is a part of the BSP source reorganization. Update #3285. --- bsps/powerpc/shared/altivec/README | 184 +++++ bsps/powerpc/shared/altivec/vec_sup.c | 273 +++++++ bsps/powerpc/shared/altivec/vec_sup_asm.S | 821 +++++++++++++++++++++ c/src/lib/libbsp/powerpc/beatnik/Makefile.am | 12 +- .../libbsp/powerpc/motorola_powerpc/Makefile.am | 6 +- c/src/lib/libbsp/powerpc/mvme5500/Makefile.am | 10 +- c/src/lib/libbsp/powerpc/psim/Makefile.am | 4 +- c/src/lib/libcpu/powerpc/Makefile.am | 9 - c/src/lib/libcpu/powerpc/mpc6xx/altivec/README | 184 ----- c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c | 273 ------- .../libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S | 821 --------------------- 11 files changed, 1291 insertions(+), 1306 deletions(-) create mode 100644 bsps/powerpc/shared/altivec/README create mode 100644 bsps/powerpc/shared/altivec/vec_sup.c create mode 100644 bsps/powerpc/shared/altivec/vec_sup_asm.S delete mode 100644 c/src/lib/libcpu/powerpc/mpc6xx/altivec/README delete mode 100644 c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c delete mode 100644 c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S diff --git a/bsps/powerpc/shared/altivec/README b/bsps/powerpc/shared/altivec/README new file mode 100644 index 0000000000..61ebb8dded --- /dev/null +++ b/bsps/powerpc/shared/altivec/README @@ -0,0 +1,184 @@ +RTEMS ALTIVEC SUPPORT +===================== + +1. History +---------- + +Altivec support was developed and maintained as a user-extension +outside of RTEMS. This extension is still available (unbundled) +from Till Straumann ; it is useful +if an application desires 'lazy switching' of the altivec context. + +2. Modes +-------- + +Altivec support -- the unbundled extension, that is -- can be used +in two ways: + +a. All tasks are implicitly AltiVec-enabled. + +b. Only designated tasks are AltiVec-enabled. 'Lazy-context switching' + is implemented to switch AltiVec the context. + +Note that the code implemented in this directory supports mode 'a' +and mode 'a' ONLY. For mode 'b' you need the unbundled extension +(which is completely independent of this code). + +Mode 'a' (All tasks are AltiVec-enabled) +- - - - - - - - - - - - - - - - - - - - - + +The major disadvantage of this mode is that additional overhead is +involved: tasks that never use the vector unit still save/restore +the volatile vector registers (20 registers * 16bytes each) across +every interrupt and all non-volatile registers (12 registers * 16b each) +during every context switch. + +However, saving/restoring e.g., the volatile registers is quite +fast -- on my 1GHz 7457 saving or restoring 20 vector registers +takes only about 1us or even less (if there are cache hits). + +The advantage is complete transparency to the user and full ABI +compatibility (exept for ISRs and exception handlers), see below. + +Mode 'b' (Only dedicated tasks are AltiVec-enabled) +- - - - - - - - - - - - - - - - - - - - - - - - - - + +The advantage of this mode of operation is that the vector-registers +are only saved/restored when a different, altivec-enabled task becomes +ready to run. In particular, if there is only a single altivec-enabled +task then the altivec-context *never* is switched. + +Note that this mode of operation is not supported by the code +in this directory -- you need the unbundled altivec extension +mentioned above. + +3. 
Compiler Options +------------------- + +Three compiler options affect AltiVec: -maltivec, -mabi=altivec and +-mvrsave=yes/no. + +-maltivec: This lets the cpp define the symbol __ALTIVEC__ and enables + gcc to emit vector instructions. Note that gcc may use the + AltiVec engine implicitly, i.e., **without you writing any + vectorized code**. + +-mabi=altivec: This option has two effects: + i) It ensures 16-byte stack alignment required by AltiVec + (even in combination with eabi which is RTEMS' default). + ii) It allows vector arguments to be passed in vector registers. + +-mvrsave=yes/no: Instructs gcc to emit code which sets the VRSAVE register + indicating which vector registers are 'currently in use'. + Because the altivec support does not use this information *) the + option has no direct affect but it is desirable to compile with + -mvrsave=no so that no unnecessary code is generated. + + *) The file vec_sup_asm.S conditionally disables usage of + the VRSAVE information if the preprocessor symbol + 'IGNORE_VRSAVE' is defined, which is the default. + + If 'IGNORE_VRSAVE' is undefined then the code *does* + use the VRSAVE information but I found that this does + not execute noticeably faster. + +IMPORTANT NOTES +=============== + +AFAIK, RTEMS uses the EABI which requires a stack alignment of only 8 bytes +which is NOT enough for AltiVec (which requires 16-byte alignment). + +There are two ways for obtaining 16-byte alignment: + +I) Compile with -mno-eabi (ordinary SYSV ABI has 16-byte alignment) +II) Compile with -mabi=altivec (extension to EABI; maintains 16-byte alignment + but also allows for passing vector arguments in vector registers) + +Note that it is crucial to compile ***absolutely everything*** with the same +ABI options (or a linker error may occur). In particular, this includes + + - newlibc multilib variant + - RTEMS proper + - application + third-party code + +IMO the proper compiler options for Mode 'a' would be + + -maltivec -mabi=altivec -mvrsave=no + +Note that the -mcpu=7400 option also enables -maltivec and -mabi=altivec +but leaves -mvrsave at some 'default' which is probably 'no'. +Compiling with -mvrsave=yes does not produce incompatible code but +may have a performance impact (since extra code is produced to maintain +VRSAVE). + +4. Multilib Variants +-------------------- + +The default GCC configuration for RTEMS contains a -mcpu=7400 multilib +variant which is the correct one to choose. + +5. BSP 'custom' file. +--------------------- + +Now that you have the necessary newlib and libgcc etc. variants +you also need to build RTEMS accordingly. + +In you BSP's make/custom/.cfg file make sure the CPU_CFLAGS +select the desired variant: + +for mode 'a': + + CPU_CFLAGS = ... -mcpu=7400 + +Note that since -maltivec globally defines __ALTIVEC__ RTEMS automatially +enables code that takes care of switching the AltiVec context as necessary. +This is transparent to application code. + +6. BSP support +-------------- + +It is the BSP's responsibility to initialize MSR_VE, VSCR and VRSAVE +during early boot, ideally before any C-code is executed (because it +may, theoretically, use vector instructions). + +The BSP must + + - set MSR_VE + - clear VRSAVE; note that the probing algorithm for detecting + whether -mvrsave=yes or 'no' was used relies on the BSP + clearing VRSAVE during early start. 
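+   As a purely illustrative sketch (the choice of r3/v0 is an arbitrary
+   assumption; the exact code and its placement are up to the BSP), this
+   early initialization could look roughly like:
+
+     mfmsr    r3
+     oris     r3, r3, 0x0200    /* set MSR_VE (0x02000000, MSR bit 6) */
+     mtmsr    r3
+     isync
+     li       r3, 0
+     mtvrsave r3                /* clear VRSAVE                       */
+     vxor     0, 0, 0
+     mtvscr   0                 /* clear VSCR (via zeroed v0)         */
+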
Since no interrupts or + context switches happen before the AltiVec support is initialized + clearing VRSAVE is no problem even if it turns out that -mvrsave=no + was in effect (eventually a value of all-ones will be stored + in VRSAVE in this case). + - clear VSCR + +7. PSIM note +------------ + +PSIM supports the AltiVec instruction set with the exception of +the 'data stream' instructions for cache prefetching. The RTEMS +altivec support includes run-time checks to skip these instruction +when executing on PSIM. + +Note that AltiVec support within PSIM must be enabled at 'configure' +time by passing the 'configure' option + +--enable-sim-float=altivec + +Note also that PSIM's AltiVec support has many bugs. It is recommended +to apply the patches filed as an attachment with gdb bug report #2461 +prior to building PSIM. + +The CPU type and corresponding multilib must be changed when +building RTEMS/psim: + + edit make/custom/psim.cfg and change + + CPU_CFLAGS = ... -mcpu=603e + + to + + CPU_CFLAGS = ... -mcpu=7400 + +This change must be performed *before* configuring RTEMS/psim. diff --git a/bsps/powerpc/shared/altivec/vec_sup.c b/bsps/powerpc/shared/altivec/vec_sup.c new file mode 100644 index 0000000000..141779c175 --- /dev/null +++ b/bsps/powerpc/shared/altivec/vec_sup.c @@ -0,0 +1,273 @@ +/* Altivec support for RTEMS; vector register context management. */ + +/* + * Authorship + * ---------- + * This software was created by + * Till Straumann , 2009, + * Stanford Linear Accelerator Center, Stanford University. + * + * Acknowledgement of sponsorship + * ------------------------------ + * This software was produced by + * the Stanford Linear Accelerator Center, Stanford University, + * under Contract DE-AC03-76SFO0515 with the Department of Energy. + * + * Government disclaimer of liability + * ---------------------------------- + * Neither the United States nor the United States Department of Energy, + * nor any of their employees, makes any warranty, express or implied, or + * assumes any legal liability or responsibility for the accuracy, + * completeness, or usefulness of any data, apparatus, product, or process + * disclosed, or represents that its use would not infringe privately owned + * rights. + * + * Stanford disclaimer of liability + * -------------------------------- + * Stanford University makes no representations or warranties, express or + * implied, nor assumes any liability for the use of this software. + * + * Stanford disclaimer of copyright + * -------------------------------- + * Stanford University, owner of the copyright, hereby disclaims its + * copyright and all other rights in this software. Hence, anyone may + * freely use it for any purpose without restriction. + * + * Maintenance of notices + * ---------------------- + * In the interest of clarity regarding the origin and status of this + * SLAC software, this and all the preceding Stanford University notices + * are to remain affixed to any copy or derivative of this software made + * or distributed by the recipient and are to be affixed to any copy of + * software made or distributed by the recipient that contains a copy or + * derivative of this software. 
+ * + * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03 + */ + +#ifdef __ALTIVEC__ + +#include +#include +#include +#include +#include +#include + +#define STATIC static + +#define VEC_ALIGNMENT 16 + +#define NAM "AltiVec Support" +#define ERRID(a,b,c,d) (((a)<<24) | ((b)<<16) | ((c)<<8) | (d)) + +typedef uint32_t _vu32 __attribute__((vector_size(VEC_ALIGNMENT))); + +#ifndef MSR_VE +#define MSR_VE (1<<(31-6)) +#endif + +/* NOTE: These two variables are accessed by assembly code + * which assumes 32-bit data! + */ +uint32_t _CPU_altivec_ctxt_off = 0; +uint32_t _CPU_altivec_psim_cpu = 0; + +static inline uint32_t +mfmsr(void) +{ +uint32_t v; + _CPU_MSR_GET(v); + return v; +} + +static inline void +mtmsr(uint32_t v) +{ + _CPU_MSR_SET(v); +} + +static inline void +isync(void) +{ + asm volatile("isync"); +} + +static inline void +dssall(void) +{ + if ( !_CPU_altivec_psim_cpu) + asm volatile("dssall"); +} + +static inline uint32_t +set_MSR_VE(void) +{ +uint32_t rval; + rval=mfmsr(); + if ( ! (MSR_VE & rval ) ) { + mtmsr(rval | MSR_VE); + isync(); + } + return rval; +} + +static inline void +clr_MSR_VE(void) +{ + dssall(); + mtmsr(mfmsr() & ~MSR_VE); + isync(); +} + +static inline void +rst_MSR_VE(uint32_t old) +{ + if ( ! ( MSR_VE & old ) ) { + dssall(); + mtmsr(old); + isync(); + } +} + + +/* Code to probe the compiler's stack alignment (PowerPC); + * The routine determines at run-time if the compiler generated + * 8 or 16-byte aligned code. + * + * Till Straumann , 2005 + */ + +static void dummy(void) __attribute__((noinline)); +/* add (empty) asm-statement to make sure this isn't optimized away */ +static void dummy(void) { __asm__ volatile(""); } + +static unsigned probe_r1(void) __attribute__((noinline)); +static unsigned probe_r1(void) +{ +unsigned r1; + /* call something to enforce creation of a minimal stack frame; + * (8 bytes: r1 and lr space for 'dummy' callee). If compiled + * with -meabi -mno-altivec gcc allocates 8 bytes, if -mno-eabi + * or -maltivec / -mabi=altivec then gcc allocates 16 bytes + * according to the sysv / altivec ABI specs. + */ + dummy(); + /* return stack pointer */ + asm volatile("mr %0,1":"=r"(r1)); + return r1; +} + +static unsigned +probe_ppc_stack_alignment(void) +{ +unsigned r1; + asm volatile("mr %0,1":"=r"(r1)); + return (r1 - probe_r1()) & ~ 0xf; +} + +STATIC int check_stack_alignment(void) +{ +int rval = 0; + if ( VEC_ALIGNMENT > PPC_STACK_ALIGNMENT ) { + printk(NAM": CPU support has unsufficient stack alignment;\n"); + printk("modify 'cpukit/score/cpu/powerpc/rtems/score/powerpc.h'\n"); + printk("and choose PPC_ABI_SVR4. I'll enable a workaround for now.\n"); + rval |= 1; + } + /* Run-time check; should compile with -mabi=altivec */ + if ( probe_ppc_stack_alignment() < VEC_ALIGNMENT ) { + printk(NAM": run-time stack alignment unsufficient; make sure you compile with -mabi=altivec\n"); + rval |= 2; + } + return rval; +} + + +static uint32_t probe_vrsave(_vu32 *p_v) __attribute__((noinline)); + +/* Check if this code was compiled with -mvrsave=yes or no + * so that we can set the default/init value accordingly. + */ +static uint32_t probe_vrsave(_vu32 *p_v) +{ +_vu32 x; +uint32_t vrsave; + /* Explicitly clobber a volatile vector reg (0) that is + * not used to pass return values. + * If -mvrsave=yes was used this should cause gcc to + * set bit 0 in vrsave. OTOH this bit cannot be set + * because v0 is volatile and not used to pass a value + * to the caller... 
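+ * (vrsave_yes() below compares the vrsave value it read before this
+ * probe with the value returned here; a difference indicates that the
+ * code was compiled with -mvrsave=yes.)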
+ */ + asm volatile("vxor %0, 0, 0; mfvrsave %1":"=v"(x),"=r"(vrsave)::"v0"); + if ( p_v ) { + *p_v = x; + } + return vrsave; +} + +static int vrsave_yes(void) __attribute__((noinline)); + +static int vrsave_yes(void) +{ +uint32_t vrsave_pre; + asm volatile("mfvrsave %0":"=r"(vrsave_pre)); + if ( (vrsave_pre & 0x80000000) ) { + printk(NAM": WARNING - unable to determine whether -mvrsave was used; assuming NO\n"); + return 0; + } + return probe_vrsave(0) != vrsave_pre; +} + +extern void +_CPU_altivec_set_vrsave_initval(uint32_t); + + +void +_CPU_Initialize_altivec(void) +{ +unsigned pvr; + + /* I don't like to have to #define the offset of the altivec area + * for use by assembly code. + * Therefore, we compute it here and store it in memory... + */ + _CPU_altivec_ctxt_off = offsetof(ppc_context, altivec); + + /* + * See ppc_get_context() and PPC_CONTEXT_OFFSET_GPR1 + */ + _CPU_altivec_ctxt_off += PPC_DEFAULT_CACHE_LINE_SIZE; + + /* + * Add space possibly needed for alignment + */ + _CPU_altivec_ctxt_off += PPC_CACHE_ALIGNMENT - 1; + + if ( ! vrsave_yes() ) { + /* They seemed to compile with -mvrsave=no. Hence we + * must set VRSAVE so that all registers are saved/restored + * in case this support was not built with IGNORE_VRSAVE. + */ + _CPU_altivec_set_vrsave_initval( -1 ); + } + + if ( check_stack_alignment() & 2 ) + rtems_fatal_error_occurred(ERRID('V','E','C','1')); + + pvr = get_ppc_cpu_type(); + /* psim has altivec but lacks the streaming instructions :-( */ + _CPU_altivec_psim_cpu = (PPC_PSIM == pvr); + + if ( ! ppc_cpu_has_altivec() ) { + printk(NAM": This CPU seems not to have AltiVec\n"); + rtems_panic("Unable to initialize AltiVec Support\n"); + } + + if ( ! (mfmsr() & MSR_VE) ) { + printk(NAM": Warning: BSP should set MSR_VE early; doing it now...\n"); + set_MSR_VE(); + } +} +#endif diff --git a/bsps/powerpc/shared/altivec/vec_sup_asm.S b/bsps/powerpc/shared/altivec/vec_sup_asm.S new file mode 100644 index 0000000000..279d1704a7 --- /dev/null +++ b/bsps/powerpc/shared/altivec/vec_sup_asm.S @@ -0,0 +1,821 @@ +#ifdef __ALTIVEC__ + +/* Altivec support for RTEMS; vector register context management. */ + +/* + * Authorship + * ---------- + * This software was created by + * Till Straumann , 2009, + * Stanford Linear Accelerator Center, Stanford University. + * + * Acknowledgement of sponsorship + * ------------------------------ + * This software was produced by + * the Stanford Linear Accelerator Center, Stanford University, + * under Contract DE-AC03-76SFO0515 with the Department of Energy. + * + * Government disclaimer of liability + * ---------------------------------- + * Neither the United States nor the United States Department of Energy, + * nor any of their employees, makes any warranty, express or implied, or + * assumes any legal liability or responsibility for the accuracy, + * completeness, or usefulness of any data, apparatus, product, or process + * disclosed, or represents that its use would not infringe privately owned + * rights. + * + * Stanford disclaimer of liability + * -------------------------------- + * Stanford University makes no representations or warranties, express or + * implied, nor assumes any liability for the use of this software. + * + * Stanford disclaimer of copyright + * -------------------------------- + * Stanford University, owner of the copyright, hereby disclaims its + * copyright and all other rights in this software. Hence, anyone may + * freely use it for any purpose without restriction. 
+ * + * Maintenance of notices + * ---------------------- + * In the interest of clarity regarding the origin and status of this + * SLAC software, this and all the preceding Stanford University notices + * are to remain affixed to any copy or derivative of this software made + * or distributed by the recipient and are to be affixed to any copy of + * software made or distributed by the recipient that contains a copy or + * derivative of this software. + * + * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03 + */ + + +#include + +#ifndef PPC_CACHE_ALIGNMENT +#error "Missing header; PPC_CACHE_ALIGNMENT is not defined" +#endif + +#define ALTIVEC_TESTING + +#if PPC_CACHE_ALIGNMENT != 32 +#error "Altivec support assumes cache-line size is 32 bytes!" +#else +#undef LD_PPC_CACHE_ALIGNMENT +#define LD_PPC_CACHE_ALIGNMENT 5 +#endif + + .set v0, 0 + .set v8, 8 + .set v16, 16 + .set v20, 20 + .set v24, 24 + .set v28, 28 + + .set r0, 0 + .set r3, 3 + .set r4, 4 + /* Do not use r5, since this is used by _CPU_Context_switch() */ + .set r6, 6 + .set r7, 7 + .set r8, 8 + .set r9, 9 + .set r10, 10 + .set r11, 11 + /* Do not use r12, since this is used by _CPU_Context_switch() */ + + .set cr5, 5 + + .set VECSIZE, 16 + + .set VRSAVE_INIT_VAL, 0 + .set VSCR_INIT_VAL, 0 + + .set VRSAVE_OFF, 16 + .set VSCR_OFF, 16+12 + + .set ds0, 0 + + /* Block size for dst -- in units of 16-bytes */ + .set BSIZE, 2 /* = 32 bytes */ + .set BCNT, 12/2+1 /* 12 non-volatile registers + area for vscr/vrsave */ + .set BSTRIDE, 32 /* bytes */ + + .data + + .global _CPU_altivec_vrsave_initval +_CPU_altivec_vrsave_initval: + .long 0 + + .global _CPU_altivec_vscr_initval +_CPU_altivec_vscr_initval: + .long 0 + + .text + + .extern _CPU_altivec_psim_cpu + .extern _CPU_altivec_ctxt_off + + .macro CMPOFF _B0 + lis \_B0, _CPU_altivec_ctxt_off@ha + lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0) + .endm + + /* Conditionally load or store a vector _VR to + * EA(_R1|0 + _R2) + * If bit _VR (corresponding to _VR) is set in CRC + * then the load/store is performed but otherwise + * it is skipped. + * If compiled with IGNORE_VRSAVE defined then + * the load/store is done unconditionally. + * + * _OPCODE: intended to be lvx, lvxl, stvx or stvxl + * _VR : target vector register + * _R1 : base register (NOTE: _R1=r0 uses a + * implicit ZERO constant, not the contents + * of r0) for address computation. + * _R2 : 'offset' register for address computation. + * + * MODIFIES: _VR on output if a load operation is performed. + * IMPLICIT USE: CRC (unless compiled with IGNORE_VRSAVE + * defined. + */ + .macro LDST _OPCODE, _VR, _R1, _R2 +#ifndef IGNORE_VRSAVE + bc 4, \_VR, 111f +#endif + \_OPCODE \_VR, \_R1, \_R2 +111: + .endm + + /* + * Load or store four 'adjacent' vector registers. + * + * _OPCODE: intended to be lvx, lvxl, stvx or stvxl + * _VR : target vector register + * _R1 : base register (NOTE: _R1=r0 uses a + * implicit ZERO constant, not the contents + * of r0) for address computation. + * _B0 : base register 0 + * _B1 : base register 1 + * _B2 : base register 2 + * _B3 : base register 3 + * _RO : offset register + * + * memory addresses for _VR, _VR+1, _VR+2, _VR+3 + * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively. + * + * MODIFIES: _VR, _VR+1, _VR+2, _VR+3 if a load + * operation is performed. 
+ * IMPLICIT USE: see LDST + */ + .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO + LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO + .endm + + /* + * Preload/zero two cache lines and save 4 vector registers + * to memory. + * Note that the cache operation targets memory *past* the + * current storage area which should hopefully hit when + * This same code is executed on the next two cache lines... + * + * This code effectively does + * dcbz (_B0 + 64) + * dcbz (_B0 + 64 + 32) + * stvx _VF+0, (_B0+ 0) + * stvx _VF+1, (_B0+16) + * stvx _VF+2, (_B0+32) + * stvx _VF+3, (_B0+48) + * + * _LRU: may be 'l' or empty. The former variant should be + * used when it is conceivable that the memory area is + * unlikely to be used in the near future thus making + * it a candidate for early eviction from the caches. + * + * If it is likely that the memory area is reused soon + * (e.g., save/restore across ISR execution) then the + * 'stvx' opcode (w/o 'l' suffix) should be used. + * + * _VR: first of four target vector registers; _VR+0, + * _VR+1, _VR+2, _VR+3 are saved. + * + * _BO: base address of memory area. + * _B1: should contain _B0+16 on entry + * _B2: should contain _B0+32 on entry + * _B3: should contain _B0+48 on entry + * + * _O1: contains the offset where the four vectors are + * stored. + * _VR -> (_B0 + _O1) = (_B0 + _O1 + 0 ) + * _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 ) + * _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 ) + * _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 ) + * _O2: is set to _O1 + 64 by this macro. Hence _O2 is + * used to address the two cache-lines past the + * current memory area. + * + * MODIFIES: _O2; contains _O1 + 64 after execution of this + * code. + * + * NOTES: a different set of four vectors can be addressed + * simply by changing the one offset register _O1. + * + * Saving more than 4 registers can simply be + * achieved by expanding this macro multiple + * times with _O1 and _O2 swapped (new _O1 + * becomes _O2 = old _O1 + 64) thus stepping + * through the memory area. + * + */ + .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT + dcbz \_B0, \_O2 + dcbz \_B2, \_O2 + LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 + .endm + + /* + * Save eight vector registers by expanding S4VEC_P twice. + * See notes for S4VEC_P above. + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * + * MODIFIES: After execution, + * _O2 contains original _O1 + 64, + * _O1 contains original _O1 + 128 + * + * NOTES: Expanding this macro multiple times lets you save + * multiple blocks of 8 registers (no reload of _Bx / _Ox is needed). + */ + .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + /* Note that the roles of _O1 and _O2 are swapped here */ + S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1 + .endm + + /* + * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1) + * + * See notes above (for S4VEC_P). 
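+	 * (Expands to two S8VEC_P blocks covering v0..v15 plus one LDST4
+	 * for v16..v19, i.e. 20 vectors of 16 bytes = 320 bytes total.)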
+ * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 256 + * _O2 contains original _O1 + 256 - 64 + */ + .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 + .endm + + /* + * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1) + * + * See notes above (for S4VEC_P, S_V0TOV19). + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 128 + * _O2 contains original _O1 + 128 - 64 + */ + .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1 + .endm + + /* + * Save all registers to memory area + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 512 + * _O2 contains original _O1 + 512 - 64 + */ + .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P l v0 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S8VEC_P l v8 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2 + .endm + + + /* + * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively. + * We can pass either of them as arguments to another macro which + * allows us to decide if the main macro uses dcbt or not when + * we expand it... + */ + .macro DO_DCBT _RA, _RB + dcbt \_RA, \_RB + .endm + + .macro NO_DCBT _RA, _RB + .endm + + /* + * NOTE REGARDING dcbt VS dst + * + * Preloading the cache with memory areas that we soon need + * can be done either using 'dcbt' or 'dst' instructions + * "ahead of time". + * When experimenting (on a mpc7457) I found that the 'dst' + * stream instruction was very efficient if there is enough + * time to read ahead. It works well when we do a context + * switch: + * + * 1) start DST on new context to be loaded + * 2) save old context to memory + * 3) load new context from memory + * + * Because of the interleaved step 2) dst works nicely and + * 3) finds what it needs in the cache. + * + * However, in a situation when there is not much time + * to start the DST, e.g., because we want to restore + * a context out of the blue (e.g., after returning + * from and ISR): + * + * 1) save volatile registers to memory/stack + * 2) execute ISR + * 3) might do a task context switch + * 4) when returned to old task context then + * reload volatile registers from memory/stack. + * + * In this situation, preloading the target memory before + * or after step 1) makes obviously no sense because after + * 1) the registers area is most likely in the cache already. + * + * Starting preload after 2) doesn't make much sense either. + * If ISR doesn't lead to a context switch then it is quite + * likely that the register area is still in the cache. + * OTOTH, if a context switch happens then the preload after 2) + * might be useless. + * + * This leaves us at step 4) where we want to load immediately. + * In this case, I found that 'dcbt' works more efficiently + * so that's what we use when restoring volatile registers. + * + * When restoring the non-volatile VRs during a 'normal' + * context switch then we shall use DST (and no dcbt). 
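+	 *
+	 * In short: the volatile-register restore path (L_V0TOV19) uses
+	 * dcbt, whereas the non-volatile path (L_V20TOV31) relies on the
+	 * dst stream started by _CPU_Context_switch_altivec.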
+ */ + + /* + * Symmetric to S4VEC_P above but addresses loading four + * vector registers from memory. + * + * Touches two cache lines past the current memory area + * and loads four vectors from the current area. + * + * Optionally, the DCBT operation may be omitted + * (when expanding with _DCBT=NO_DCBT). + * This is useful if the cache was already preloaded + * by another means (dst instruction). + * + * NOTE: We always use the 'LRU' form of lvx: lvxl, + * because we deem it unlikely that the context + * that was just loaded has to be saved again + * to memory in the immediate future. + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O2 contains original _O1 + 64. + * _VR.._VR+3 loaded from memory. + */ + .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT + /* preload/touch 2 lines at offset 64 from _B0 */ + \_DCBT \_B0, \_O2 + \_DCBT \_B2, \_O2 + /* load four vectors at off set 0 from _B0 */ + LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Symmetric to S8VEC_P; loads 8 vector registers + * from memory -- see comments above... + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 128. + * _O2 contains original _O1 + 64. + * _VR.._VR+7 loaded from memory. + */ + .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1 + .endm + + /* + * Load volatile vector registers v0..v19 employing + * the DCBT to preload the cache. The rationale for + * using DCBT here but not when restoring non-volatile + * registers is explained above, see + * + * "NOTE REGARDING dcbt VS dst" + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 256. + * _O2 contains original _O1 + 256 - 64. + * VR0..VR19 loaded from memory. + */ + .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Load non-volatile vector registers v20..v31. + * Note that no DCBT is performed since we use + * DST for preloading the cache during a context + * switch, see + * + * "NOTE REGARDING dcbt VS dst" + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 128. + * _O2 contains original _O1 + 128 - 64. + * VR20..VR31 loaded from memory. + */ + .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Load all registers from memory area. + */ + .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2 + .endm + + /* + * Compute + * _B1 = _B0 + 16 + * _B2 = _B0 + 32 + * _B3 = _B0 + 48 + * and load + * _RO = 0 + * + * convenience macro to be expanded before + * any of the load/store macros that use + * four base addresses etc. + * + * INPUT: _B0 = cache-aligned start of memory area + * + * MODIFIES: _B1, _B2, _B3, _RO as described above. 
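+	 *
+	 * Example: 'CMP_BASES r3, r4, r8, r6, r10' (the invocation used
+	 * in _CPU_load_altivec_volatile) yields r4 = r3 + 16,
+	 * r8 = r3 + 32, r6 = r3 + 48 and r10 = 0.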
+ */ + .macro CMP_BASES _B0, _B1, _B2, _B3, _RO + addi \_B1, \_B0, 1*VECSIZE + addi \_B2, \_B0, 2*VECSIZE + addi \_B3, \_B0, 3*VECSIZE + li \_RO, 0 + .endm + + /* + * Prepare for saving general vector registers. + * + * If not built with #define IGNORE_VRSAVE then + * + * 1) copy vrsave to CRC + * + * endif + * + * 2) copy vrsave to _VRSAVE_REG + * 3) preload/zero cache line where vrsave and vscr are stored. + * 4) compute base adresses from _B0 + * 5) preload/zero first two cache lines (remember that the + * first S8VEC_P starts preloading/zeroing at offset 64). + * + * INPUT: 'vrsave' register, _B0 (base address of memory area) + * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave') + * _B0 = original _BO + 32 + * _B1 = original _B0 + 32 + 16, + * _B2 = original _B0 + 32 + 32, + * _B3 = original _B0 + 32 + 48, + * CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined) + */ + .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO + mfvrsave \_VRSAVE_REG +#ifndef IGNORE_VRSAVE + mtcr \_VRSAVE_REG +#endif + dcbz 0, \_B0 + addi \_B0, \_B0, PPC_CACHE_ALIGNMENT + dcbz 0, \_B0 + CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO + dcbz 0, \_B2 + .endm + + /* + * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers + * must have been loaded from 'vrsave' and 'vscr', respectively, + * prior to expanding this macro. + * + * INPUTS: _VRSAVE_REG GPR holding 'vrsave' contents + * _VSCR_VREG VR holding 'vscr' contents + * _B0 cache-aligned (base) address of memory area. + * MODIFIES: _SCRATCH_REG + */ + .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG + stw \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) + li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF + stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG + .endm + + /* + * Load 'vrsave' and 'vscr' from memory. + * + * INPUTS: _B0 cache-aligned (base) address of memory area. + * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr) + * 'vscr', 'vrsave'. + * CRC (holds contents of 'vrsave') (ONLY IF COMPILED + * with IGNORE_VRSAVE undefined). + */ + .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG + lwz \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) + mtvrsave \_SCRATCH_REG +#ifndef IGNORE_VRSAVE + mtcr \_SCRATCH_REG +#endif + li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF + lvewx \_SCRATCH_VREG, \_B0, \_SCRATCH_REG + mtvscr \_SCRATCH_VREG + .endm + + /* + * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1) + * + * INPUT: _B0 + * MODIFIES: _B0 (as stated above) + */ + .macro CACHE_DOWNALGN _B0 + rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT + .endm + + .text + + .global _CPU_save_altivec_volatile +_CPU_save_altivec_volatile: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + PREP_FOR_SAVE r0, r3, r4, r8, r6, r10 + /* r0 now contains VRSAVE, r3 still the aligned memory area + * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3, + * respectively. 
r10 holds zero + */ + S_V0TOV19 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11 + mfvscr v0 + /* Store vrsave (still in r0) and vscr (in v0) to memory area */ + S_VSCR_VRSAVE r0, v0, r3, r11 + +#ifndef IGNORE_VRSAVE + /* Restore CRC */ + mtcr r9 +#endif + blr + + .global _CPU_load_altivec_volatile +_CPU_load_altivec_volatile: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + /* Try to preload 1st line (where vscr and vrsave are stored) */ + dcbt 0, r3 + /* Point to start of general vector-register area */ + addi r3, r3, PPC_CACHE_ALIGNMENT + /* Start preloading 2nd line (where first two vectors are) */ + dcbt 0, r3 + L_VSCR_VRSAVE r3, r0, v0 + CMP_BASES r3, r4, r8, r6, r10 + /* Start preloading 3rd line (where vectors 3 and 4 are) */ + dcbt 0, r8 + L_V0TOV19 r3, r4, r8, r6, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r9 +#endif + blr + + .global _CPU_Context_switch_altivec +_CPU_Context_switch_altivec: + + /* fetch offset of altivec area in context */ + CMPOFF r8 + /* down-align 'to' area to cache-line boundary */ + add r4, r4, r8 + CACHE_DOWNALGN r4 + + /* Check for PSIM */ + lis r6, _CPU_altivec_psim_cpu@ha + lwz r6, _CPU_altivec_psim_cpu@l(r6) + cmpli 0, r6, 0 + bne 1f + /* Skip data-stream instructions on PSIM (not implemented) */ + dssall + /* Pre-load new context into cache */ + lis r6, (BSIZE<<(24-16)) | (BCNT<<(16-16)) + ori r6, r6, BSTRIDE + dstt r4, r6, ds0 +1: + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + /* Is 'from' context == NULL ? (then we just do a 'restore') */ + cmpli 0, r3, 0 + beq 1f /* yes: skip saving 'from' context */ + + /* SAVE NON-VOLATILE REGISTERS */ + + /* Compute aligned destination pointer (r8 still holds offset + * to 'altivec' area in context) + */ + add r3, r3, r8 + CACHE_DOWNALGN r3 + + PREP_FOR_SAVE r0, r3, r8, r6, r7, r10 + /* The manual says reading vscr can take some time - do + * read it here (into a volatile vector register) while + * we wait for cache blocks to be allocated + */ + mfvscr v0 + S_V20TOV31 _LRU=l, _B0=r3, _B1=r8, _B2=r6, _B3=r7, _O1=r10, _O2=r11 + /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */ + S_VSCR_VRSAVE r0, v0, r3, r8 + +1: + + /* LOAD NON-VOLATILE REGISTERS */ + + /* Advance past vrsave/vscr area */ + addi r4, r4, PPC_CACHE_ALIGNMENT + L_VSCR_VRSAVE r4, r0, v0 + CMP_BASES r4, r8, r6, r7, r10 + L_V20TOV31 r4, r8, r6, r7, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r9 +#endif + blr + + .global _CPU_Context_initialize_altivec +_CPU_Context_initialize_altivec: + CMPOFF r8 + add r3, r3, r8 + CACHE_DOWNALGN r3 + lis r8, _CPU_altivec_vrsave_initval@ha + lwz r8, _CPU_altivec_vrsave_initval@l(r8) + stw r8, VRSAVE_OFF(r3) + lis r6, _CPU_altivec_vscr_initval@ha + lwz r6, _CPU_altivec_vscr_initval@l(r6) + stw r6, VSCR_OFF(r3) + blr + + /* + * Change the initial value of VRSAVE. + * Can be used by initialization code if + * it is determined that code was compiled + * with -mvrsave=no. In this case, VRSAVE + * must be set to all-ones which causes this + * support code to save/restore *all* registers + * (only has an effect if IGNORE_VRSAVE is + * not defined -- otherwise all registers are + * saved/restored anyways). 
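+	 *
+	 * Typical use -- see _CPU_Initialize_altivec() in vec_sup.c:
+	 *
+	 *    _CPU_altivec_set_vrsave_initval( -1 );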
+ */ + .global _CPU_altivec_set_vrsave_initval +_CPU_altivec_set_vrsave_initval: + lis r8, _CPU_altivec_vrsave_initval@ha + stw r3, _CPU_altivec_vrsave_initval@l(r8) + mtvrsave r3 + blr + +#ifdef ALTIVEC_TESTING + .global msr_VE_on +msr_VE_on: + mfmsr r3 + oris r3, r3, 1<<(31-6-16) + mtmsr r3 + blr + + .global msr_VE_off +msr_VE_off: + mfmsr r3 + lis r4, 1<<(31-6-16) + andc r3, r3, r4 + mtmsr r3 + blr + + + .global mfvrsave +mfvrsave: + mfvrsave r3 + blr + + .global mtvrsave +mtvrsave: + mtvrsave r3 + blr + + /* Load all vector registers from memory area. + * NOTE: This routine is not strictly ABI compliant -- + * it guarantees that volatile vector registers + * have certain values on exit! + */ + .global _CPU_altivec_load_all +_CPU_altivec_load_all: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + /* Try to preload 1st line (where vscr and vrsave are stored) */ + dcbt 0, r3 + /* Point to start of general vector-register area */ + addi r3, r3, PPC_CACHE_ALIGNMENT + /* Start preloading 2nd line (where first two vectors are) */ + dcbt 0, r3 + L_VSCR_VRSAVE r3, r0, v0 + CMP_BASES r3, r4, r8, r6, r10 + /* Start preloading 3rd line (where vectors 3 and 4 are) */ + dcbt 0, r8 + L_V0TOV31 r3, r4, r8, r6, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r9 +#endif + blr + + .global _CPU_altivec_save_all +_CPU_altivec_save_all: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + PREP_FOR_SAVE r0, r3, r4, r8, r6, r10 + /* r0 now contains VRSAVE, r3 still the aligned memory area + * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3, + * respectively. 
r10 holds zero + */ + S_V0TOV31 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11 + mfvscr v0 + /* Store vrsave (still in r0) and vscr (in v0) to memory area */ + S_VSCR_VRSAVE r0, v0, r3, r11 + +#ifndef IGNORE_VRSAVE + /* Restore CRC */ + mtcr r9 +#endif + blr + + +#if 0 + .gnu_attribute 4,1 + .gnu_attribute 8,1 +#endif + +#endif +#endif diff --git a/c/src/lib/libbsp/powerpc/beatnik/Makefile.am b/c/src/lib/libbsp/powerpc/beatnik/Makefile.am index 08bec33bb7..fa29c8c9ac 100644 --- a/c/src/lib/libbsp/powerpc/beatnik/Makefile.am +++ b/c/src/lib/libbsp/powerpc/beatnik/Makefile.am @@ -158,11 +158,16 @@ network_if_em.rel: network_if_em_tmp.rel $(OBJCOPY) -G rtems_em_attach -G net_driver_ticks_per_sec \ -G rtems_em_pci_setup -G rtems_em_early_link_check_ops \ $^ $@ + +libbsp_a_LIBADD = network_support.rel \ + network_if_mve.rel network_if_gfe.rel network_if_em.rel endif # tod libbsp_a_SOURCES += ../../shared/tod.c tod/todcfg.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup_asm.S libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/cache/cache.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/dev/clock-ppc-dec.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/bat.c @@ -171,13 +176,6 @@ libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/pte121.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/irq/ppc-irq-legacy.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/ppc-dec-timer.c -libbsp_a_LIBADD = ../../../libcpu/@RTEMS_CPU@/mpc6xx/altivec.rel - -if HAS_NETWORKING -libbsp_a_LIBADD += network_support.rel \ - network_if_mve.rel network_if_gfe.rel network_if_em.rel -endif - EXTRA_DIST += README LICENSE include $(top_srcdir)/../../../../automake/local.am diff --git a/c/src/lib/libbsp/powerpc/motorola_powerpc/Makefile.am b/c/src/lib/libbsp/powerpc/motorola_powerpc/Makefile.am index aa27a39c72..e3be115a23 100644 --- a/c/src/lib/libbsp/powerpc/motorola_powerpc/Makefile.am +++ b/c/src/lib/libbsp/powerpc/motorola_powerpc/Makefile.am @@ -114,6 +114,8 @@ libbsp_a_SOURCES += ../../i386/pc386/ne2000/ne2000.c endif endif +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup_asm.S libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/cache/cache.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/dev/clock-ppc-dec.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/bat.c @@ -122,9 +124,7 @@ libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/pte121.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/irq/ppc-irq-legacy.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/ppc-dec-timer.c -libbsp_a_LIBADD = \ - polledIO.rel \ - ../../../libcpu/@RTEMS_CPU@/mpc6xx/altivec.rel +libbsp_a_LIBADD = polledIO.rel EXTRA_DIST += BOOTING README.mtx603e README.MVME2100 README.MVME2300 \ README.MVME2400 \ diff --git a/c/src/lib/libbsp/powerpc/mvme5500/Makefile.am b/c/src/lib/libbsp/powerpc/mvme5500/Makefile.am index 1ad78e0776..450153a739 100644 --- a/c/src/lib/libbsp/powerpc/mvme5500/Makefile.am +++ b/c/src/lib/libbsp/powerpc/mvme5500/Makefile.am @@ -66,6 +66,7 @@ network_rel_SOURCES = network/if_100MHz/GT64260eth.c \ network/if_1GHz/if_wm.c network/if_1GHz/pci_map.c network_rel_CPPFLAGS = $(AM_CPPFLAGS) $(network_CPPFLAGS) network_rel_LDFLAGS = $(RTEMS_RELLDFLAGS) +libbsp_a_LIBADD = network.rel endif EXTRA_DIST += 
../../powerpc/shared/start/rtems_crti.S @@ -83,6 +84,8 @@ project_lib_DATA += mvme5500start.$(OBJEXT) project_lib_DATA += linkcmds dist_project_lib_DATA += ../shared/startup/linkcmds.share +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup_asm.S libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/cache/cache.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/dev/clock-ppc-dec.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/bat.c @@ -91,13 +94,6 @@ libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/pte121.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/irq/ppc-irq-legacy.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/ppc-dec-timer.c -libbsp_a_LIBADD = \ - ../../../libcpu/@RTEMS_CPU@/mpc6xx/altivec.rel - -if HAS_NETWORKING -libbsp_a_LIBADD += network.rel -endif - include $(top_srcdir)/../../../../automake/local.am include $(srcdir)/../../../../../../bsps/powerpc/shared/shared.am include $(srcdir)/../../../../../../bsps/powerpc/shared/exceptions.am diff --git a/c/src/lib/libbsp/powerpc/psim/Makefile.am b/c/src/lib/libbsp/powerpc/psim/Makefile.am index 2192daba95..49b809cd1b 100644 --- a/c/src/lib/libbsp/powerpc/psim/Makefile.am +++ b/c/src/lib/libbsp/powerpc/psim/Makefile.am @@ -60,6 +60,8 @@ if HAS_NETWORKING libbsp_a_SOURCES += network/if_sim.c endif +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup_asm.S libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/cache/cache.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/dev/clock-ppc-dec.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/bat.c @@ -68,8 +70,6 @@ libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/pte121.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/irq/ppc-irq-legacy.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/ppc-dec-timer.c -libbsp_a_LIBADD = ../../../libcpu/@RTEMS_CPU@/mpc6xx/altivec.rel - include $(top_srcdir)/../../../../automake/local.am include $(srcdir)/../../../../../../bsps/powerpc/shared/shared.am include $(srcdir)/../../../../../../bsps/powerpc/shared/exceptions.am diff --git a/c/src/lib/libcpu/powerpc/Makefile.am b/c/src/lib/libcpu/powerpc/Makefile.am index 1c15f2cc59..9d19f0e13d 100644 --- a/c/src/lib/libcpu/powerpc/Makefile.am +++ b/c/src/lib/libcpu/powerpc/Makefile.am @@ -39,13 +39,4 @@ if ppc405 ## ppc4xx/include endif # ppc405 -if mpc6xx -# mpc6xx/altivec -noinst_PROGRAMS += mpc6xx/altivec.rel -mpc6xx_altivec_rel_SOURCES = mpc6xx/altivec/vec_sup.c mpc6xx/altivec/vec_sup_asm.S -mpc6xx_altivec_rel_CPPFLAGS = $(AM_CPPFLAGS) -mpc6xx_altivec_rel_LDFLAGS = $(RTEMS_RELLDFLAGS) -endif -EXTRA_DIST += mpc6xx/altivec/README - include $(top_srcdir)/../../../automake/local.am diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/README b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/README deleted file mode 100644 index 61ebb8dded..0000000000 --- a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/README +++ /dev/null @@ -1,184 +0,0 @@ -RTEMS ALTIVEC SUPPORT -===================== - -1. History ----------- - -Altivec support was developed and maintained as a user-extension -outside of RTEMS. This extension is still available (unbundled) -from Till Straumann ; it is useful -if an application desires 'lazy switching' of the altivec context. - -2. 
Modes --------- - -Altivec support -- the unbundled extension, that is -- can be used -in two ways: - -a. All tasks are implicitly AltiVec-enabled. - -b. Only designated tasks are AltiVec-enabled. 'Lazy-context switching' - is implemented to switch AltiVec the context. - -Note that the code implemented in this directory supports mode 'a' -and mode 'a' ONLY. For mode 'b' you need the unbundled extension -(which is completely independent of this code). - -Mode 'a' (All tasks are AltiVec-enabled) -- - - - - - - - - - - - - - - - - - - - - - -The major disadvantage of this mode is that additional overhead is -involved: tasks that never use the vector unit still save/restore -the volatile vector registers (20 registers * 16bytes each) across -every interrupt and all non-volatile registers (12 registers * 16b each) -during every context switch. - -However, saving/restoring e.g., the volatile registers is quite -fast -- on my 1GHz 7457 saving or restoring 20 vector registers -takes only about 1us or even less (if there are cache hits). - -The advantage is complete transparency to the user and full ABI -compatibility (exept for ISRs and exception handlers), see below. - -Mode 'b' (Only dedicated tasks are AltiVec-enabled) -- - - - - - - - - - - - - - - - - - - - - - - - - - - -The advantage of this mode of operation is that the vector-registers -are only saved/restored when a different, altivec-enabled task becomes -ready to run. In particular, if there is only a single altivec-enabled -task then the altivec-context *never* is switched. - -Note that this mode of operation is not supported by the code -in this directory -- you need the unbundled altivec extension -mentioned above. - -3. Compiler Options -------------------- - -Three compiler options affect AltiVec: -maltivec, -mabi=altivec and --mvrsave=yes/no. - --maltivec: This lets the cpp define the symbol __ALTIVEC__ and enables - gcc to emit vector instructions. Note that gcc may use the - AltiVec engine implicitly, i.e., **without you writing any - vectorized code**. - --mabi=altivec: This option has two effects: - i) It ensures 16-byte stack alignment required by AltiVec - (even in combination with eabi which is RTEMS' default). - ii) It allows vector arguments to be passed in vector registers. - --mvrsave=yes/no: Instructs gcc to emit code which sets the VRSAVE register - indicating which vector registers are 'currently in use'. - Because the altivec support does not use this information *) the - option has no direct affect but it is desirable to compile with - -mvrsave=no so that no unnecessary code is generated. - - *) The file vec_sup_asm.S conditionally disables usage of - the VRSAVE information if the preprocessor symbol - 'IGNORE_VRSAVE' is defined, which is the default. - - If 'IGNORE_VRSAVE' is undefined then the code *does* - use the VRSAVE information but I found that this does - not execute noticeably faster. - -IMPORTANT NOTES -=============== - -AFAIK, RTEMS uses the EABI which requires a stack alignment of only 8 bytes -which is NOT enough for AltiVec (which requires 16-byte alignment). - -There are two ways for obtaining 16-byte alignment: - -I) Compile with -mno-eabi (ordinary SYSV ABI has 16-byte alignment) -II) Compile with -mabi=altivec (extension to EABI; maintains 16-byte alignment - but also allows for passing vector arguments in vector registers) - -Note that it is crucial to compile ***absolutely everything*** with the same -ABI options (or a linker error may occur). 
In particular, this includes - - - newlibc multilib variant - - RTEMS proper - - application + third-party code - -IMO the proper compiler options for Mode 'a' would be - - -maltivec -mabi=altivec -mvrsave=no - -Note that the -mcpu=7400 option also enables -maltivec and -mabi=altivec -but leaves -mvrsave at some 'default' which is probably 'no'. -Compiling with -mvrsave=yes does not produce incompatible code but -may have a performance impact (since extra code is produced to maintain -VRSAVE). - -4. Multilib Variants --------------------- - -The default GCC configuration for RTEMS contains a -mcpu=7400 multilib -variant which is the correct one to choose. - -5. BSP 'custom' file. ---------------------- - -Now that you have the necessary newlib and libgcc etc. variants -you also need to build RTEMS accordingly. - -In you BSP's make/custom/.cfg file make sure the CPU_CFLAGS -select the desired variant: - -for mode 'a': - - CPU_CFLAGS = ... -mcpu=7400 - -Note that since -maltivec globally defines __ALTIVEC__ RTEMS automatially -enables code that takes care of switching the AltiVec context as necessary. -This is transparent to application code. - -6. BSP support --------------- - -It is the BSP's responsibility to initialize MSR_VE, VSCR and VRSAVE -during early boot, ideally before any C-code is executed (because it -may, theoretically, use vector instructions). - -The BSP must - - - set MSR_VE - - clear VRSAVE; note that the probing algorithm for detecting - whether -mvrsave=yes or 'no' was used relies on the BSP - clearing VRSAVE during early start. Since no interrupts or - context switches happen before the AltiVec support is initialized - clearing VRSAVE is no problem even if it turns out that -mvrsave=no - was in effect (eventually a value of all-ones will be stored - in VRSAVE in this case). - - clear VSCR - -7. PSIM note ------------- - -PSIM supports the AltiVec instruction set with the exception of -the 'data stream' instructions for cache prefetching. The RTEMS -altivec support includes run-time checks to skip these instruction -when executing on PSIM. - -Note that AltiVec support within PSIM must be enabled at 'configure' -time by passing the 'configure' option - ---enable-sim-float=altivec - -Note also that PSIM's AltiVec support has many bugs. It is recommended -to apply the patches filed as an attachment with gdb bug report #2461 -prior to building PSIM. - -The CPU type and corresponding multilib must be changed when -building RTEMS/psim: - - edit make/custom/psim.cfg and change - - CPU_CFLAGS = ... -mcpu=603e - - to - - CPU_CFLAGS = ... -mcpu=7400 - -This change must be performed *before* configuring RTEMS/psim. diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c deleted file mode 100644 index 141779c175..0000000000 --- a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c +++ /dev/null @@ -1,273 +0,0 @@ -/* Altivec support for RTEMS; vector register context management. */ - -/* - * Authorship - * ---------- - * This software was created by - * Till Straumann , 2009, - * Stanford Linear Accelerator Center, Stanford University. - * - * Acknowledgement of sponsorship - * ------------------------------ - * This software was produced by - * the Stanford Linear Accelerator Center, Stanford University, - * under Contract DE-AC03-76SFO0515 with the Department of Energy. 
- * - * Government disclaimer of liability - * ---------------------------------- - * Neither the United States nor the United States Department of Energy, - * nor any of their employees, makes any warranty, express or implied, or - * assumes any legal liability or responsibility for the accuracy, - * completeness, or usefulness of any data, apparatus, product, or process - * disclosed, or represents that its use would not infringe privately owned - * rights. - * - * Stanford disclaimer of liability - * -------------------------------- - * Stanford University makes no representations or warranties, express or - * implied, nor assumes any liability for the use of this software. - * - * Stanford disclaimer of copyright - * -------------------------------- - * Stanford University, owner of the copyright, hereby disclaims its - * copyright and all other rights in this software. Hence, anyone may - * freely use it for any purpose without restriction. - * - * Maintenance of notices - * ---------------------- - * In the interest of clarity regarding the origin and status of this - * SLAC software, this and all the preceding Stanford University notices - * are to remain affixed to any copy or derivative of this software made - * or distributed by the recipient and are to be affixed to any copy of - * software made or distributed by the recipient that contains a copy or - * derivative of this software. - * - * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03 - */ - -#ifdef __ALTIVEC__ - -#include -#include -#include -#include -#include -#include - -#define STATIC static - -#define VEC_ALIGNMENT 16 - -#define NAM "AltiVec Support" -#define ERRID(a,b,c,d) (((a)<<24) | ((b)<<16) | ((c)<<8) | (d)) - -typedef uint32_t _vu32 __attribute__((vector_size(VEC_ALIGNMENT))); - -#ifndef MSR_VE -#define MSR_VE (1<<(31-6)) -#endif - -/* NOTE: These two variables are accessed by assembly code - * which assumes 32-bit data! - */ -uint32_t _CPU_altivec_ctxt_off = 0; -uint32_t _CPU_altivec_psim_cpu = 0; - -static inline uint32_t -mfmsr(void) -{ -uint32_t v; - _CPU_MSR_GET(v); - return v; -} - -static inline void -mtmsr(uint32_t v) -{ - _CPU_MSR_SET(v); -} - -static inline void -isync(void) -{ - asm volatile("isync"); -} - -static inline void -dssall(void) -{ - if ( !_CPU_altivec_psim_cpu) - asm volatile("dssall"); -} - -static inline uint32_t -set_MSR_VE(void) -{ -uint32_t rval; - rval=mfmsr(); - if ( ! (MSR_VE & rval ) ) { - mtmsr(rval | MSR_VE); - isync(); - } - return rval; -} - -static inline void -clr_MSR_VE(void) -{ - dssall(); - mtmsr(mfmsr() & ~MSR_VE); - isync(); -} - -static inline void -rst_MSR_VE(uint32_t old) -{ - if ( ! ( MSR_VE & old ) ) { - dssall(); - mtmsr(old); - isync(); - } -} - - -/* Code to probe the compiler's stack alignment (PowerPC); - * The routine determines at run-time if the compiler generated - * 8 or 16-byte aligned code. - * - * Till Straumann , 2005 - */ - -static void dummy(void) __attribute__((noinline)); -/* add (empty) asm-statement to make sure this isn't optimized away */ -static void dummy(void) { __asm__ volatile(""); } - -static unsigned probe_r1(void) __attribute__((noinline)); -static unsigned probe_r1(void) -{ -unsigned r1; - /* call something to enforce creation of a minimal stack frame; - * (8 bytes: r1 and lr space for 'dummy' callee). If compiled - * with -meabi -mno-altivec gcc allocates 8 bytes, if -mno-eabi - * or -maltivec / -mabi=altivec then gcc allocates 16 bytes - * according to the sysv / altivec ABI specs. 
- */ - dummy(); - /* return stack pointer */ - asm volatile("mr %0,1":"=r"(r1)); - return r1; -} - -static unsigned -probe_ppc_stack_alignment(void) -{ -unsigned r1; - asm volatile("mr %0,1":"=r"(r1)); - return (r1 - probe_r1()) & ~ 0xf; -} - -STATIC int check_stack_alignment(void) -{ -int rval = 0; - if ( VEC_ALIGNMENT > PPC_STACK_ALIGNMENT ) { - printk(NAM": CPU support has unsufficient stack alignment;\n"); - printk("modify 'cpukit/score/cpu/powerpc/rtems/score/powerpc.h'\n"); - printk("and choose PPC_ABI_SVR4. I'll enable a workaround for now.\n"); - rval |= 1; - } - /* Run-time check; should compile with -mabi=altivec */ - if ( probe_ppc_stack_alignment() < VEC_ALIGNMENT ) { - printk(NAM": run-time stack alignment unsufficient; make sure you compile with -mabi=altivec\n"); - rval |= 2; - } - return rval; -} - - -static uint32_t probe_vrsave(_vu32 *p_v) __attribute__((noinline)); - -/* Check if this code was compiled with -mvrsave=yes or no - * so that we can set the default/init value accordingly. - */ -static uint32_t probe_vrsave(_vu32 *p_v) -{ -_vu32 x; -uint32_t vrsave; - /* Explicitly clobber a volatile vector reg (0) that is - * not used to pass return values. - * If -mvrsave=yes was used this should cause gcc to - * set bit 0 in vrsave. OTOH this bit cannot be set - * because v0 is volatile and not used to pass a value - * to the caller... - */ - asm volatile("vxor %0, 0, 0; mfvrsave %1":"=v"(x),"=r"(vrsave)::"v0"); - if ( p_v ) { - *p_v = x; - } - return vrsave; -} - -static int vrsave_yes(void) __attribute__((noinline)); - -static int vrsave_yes(void) -{ -uint32_t vrsave_pre; - asm volatile("mfvrsave %0":"=r"(vrsave_pre)); - if ( (vrsave_pre & 0x80000000) ) { - printk(NAM": WARNING - unable to determine whether -mvrsave was used; assuming NO\n"); - return 0; - } - return probe_vrsave(0) != vrsave_pre; -} - -extern void -_CPU_altivec_set_vrsave_initval(uint32_t); - - -void -_CPU_Initialize_altivec(void) -{ -unsigned pvr; - - /* I don't like to have to #define the offset of the altivec area - * for use by assembly code. - * Therefore, we compute it here and store it in memory... - */ - _CPU_altivec_ctxt_off = offsetof(ppc_context, altivec); - - /* - * See ppc_get_context() and PPC_CONTEXT_OFFSET_GPR1 - */ - _CPU_altivec_ctxt_off += PPC_DEFAULT_CACHE_LINE_SIZE; - - /* - * Add space possibly needed for alignment - */ - _CPU_altivec_ctxt_off += PPC_CACHE_ALIGNMENT - 1; - - if ( ! vrsave_yes() ) { - /* They seemed to compile with -mvrsave=no. Hence we - * must set VRSAVE so that all registers are saved/restored - * in case this support was not built with IGNORE_VRSAVE. - */ - _CPU_altivec_set_vrsave_initval( -1 ); - } - - if ( check_stack_alignment() & 2 ) - rtems_fatal_error_occurred(ERRID('V','E','C','1')); - - pvr = get_ppc_cpu_type(); - /* psim has altivec but lacks the streaming instructions :-( */ - _CPU_altivec_psim_cpu = (PPC_PSIM == pvr); - - if ( ! ppc_cpu_has_altivec() ) { - printk(NAM": This CPU seems not to have AltiVec\n"); - rtems_panic("Unable to initialize AltiVec Support\n"); - } - - if ( ! 
(mfmsr() & MSR_VE) ) { - printk(NAM": Warning: BSP should set MSR_VE early; doing it now...\n"); - set_MSR_VE(); - } -} -#endif diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S deleted file mode 100644 index 279d1704a7..0000000000 --- a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S +++ /dev/null @@ -1,821 +0,0 @@ -#ifdef __ALTIVEC__ - -/* Altivec support for RTEMS; vector register context management. */ - -/* - * Authorship - * ---------- - * This software was created by - * Till Straumann , 2009, - * Stanford Linear Accelerator Center, Stanford University. - * - * Acknowledgement of sponsorship - * ------------------------------ - * This software was produced by - * the Stanford Linear Accelerator Center, Stanford University, - * under Contract DE-AC03-76SFO0515 with the Department of Energy. - * - * Government disclaimer of liability - * ---------------------------------- - * Neither the United States nor the United States Department of Energy, - * nor any of their employees, makes any warranty, express or implied, or - * assumes any legal liability or responsibility for the accuracy, - * completeness, or usefulness of any data, apparatus, product, or process - * disclosed, or represents that its use would not infringe privately owned - * rights. - * - * Stanford disclaimer of liability - * -------------------------------- - * Stanford University makes no representations or warranties, express or - * implied, nor assumes any liability for the use of this software. - * - * Stanford disclaimer of copyright - * -------------------------------- - * Stanford University, owner of the copyright, hereby disclaims its - * copyright and all other rights in this software. Hence, anyone may - * freely use it for any purpose without restriction. - * - * Maintenance of notices - * ---------------------- - * In the interest of clarity regarding the origin and status of this - * SLAC software, this and all the preceding Stanford University notices - * are to remain affixed to any copy or derivative of this software made - * or distributed by the recipient and are to be affixed to any copy of - * software made or distributed by the recipient that contains a copy or - * derivative of this software. - * - * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03 - */ - - -#include - -#ifndef PPC_CACHE_ALIGNMENT -#error "Missing header; PPC_CACHE_ALIGNMENT is not defined" -#endif - -#define ALTIVEC_TESTING - -#if PPC_CACHE_ALIGNMENT != 32 -#error "Altivec support assumes cache-line size is 32 bytes!" 
-#else -#undef LD_PPC_CACHE_ALIGNMENT -#define LD_PPC_CACHE_ALIGNMENT 5 -#endif - - .set v0, 0 - .set v8, 8 - .set v16, 16 - .set v20, 20 - .set v24, 24 - .set v28, 28 - - .set r0, 0 - .set r3, 3 - .set r4, 4 - /* Do not use r5, since this is used by _CPU_Context_switch() */ - .set r6, 6 - .set r7, 7 - .set r8, 8 - .set r9, 9 - .set r10, 10 - .set r11, 11 - /* Do not use r12, since this is used by _CPU_Context_switch() */ - - .set cr5, 5 - - .set VECSIZE, 16 - - .set VRSAVE_INIT_VAL, 0 - .set VSCR_INIT_VAL, 0 - - .set VRSAVE_OFF, 16 - .set VSCR_OFF, 16+12 - - .set ds0, 0 - - /* Block size for dst -- in units of 16-bytes */ - .set BSIZE, 2 /* = 32 bytes */ - .set BCNT, 12/2+1 /* 12 non-volatile registers + area for vscr/vrsave */ - .set BSTRIDE, 32 /* bytes */ - - .data - - .global _CPU_altivec_vrsave_initval -_CPU_altivec_vrsave_initval: - .long 0 - - .global _CPU_altivec_vscr_initval -_CPU_altivec_vscr_initval: - .long 0 - - .text - - .extern _CPU_altivec_psim_cpu - .extern _CPU_altivec_ctxt_off - - .macro CMPOFF _B0 - lis \_B0, _CPU_altivec_ctxt_off@ha - lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0) - .endm - - /* Conditionally load or store a vector _VR to - * EA(_R1|0 + _R2) - * If bit _VR (corresponding to _VR) is set in CRC - * then the load/store is performed but otherwise - * it is skipped. - * If compiled with IGNORE_VRSAVE defined then - * the load/store is done unconditionally. - * - * _OPCODE: intended to be lvx, lvxl, stvx or stvxl - * _VR : target vector register - * _R1 : base register (NOTE: _R1=r0 uses a - * implicit ZERO constant, not the contents - * of r0) for address computation. - * _R2 : 'offset' register for address computation. - * - * MODIFIES: _VR on output if a load operation is performed. - * IMPLICIT USE: CRC (unless compiled with IGNORE_VRSAVE - * defined. - */ - .macro LDST _OPCODE, _VR, _R1, _R2 -#ifndef IGNORE_VRSAVE - bc 4, \_VR, 111f -#endif - \_OPCODE \_VR, \_R1, \_R2 -111: - .endm - - /* - * Load or store four 'adjacent' vector registers. - * - * _OPCODE: intended to be lvx, lvxl, stvx or stvxl - * _VR : target vector register - * _R1 : base register (NOTE: _R1=r0 uses a - * implicit ZERO constant, not the contents - * of r0) for address computation. - * _B0 : base register 0 - * _B1 : base register 1 - * _B2 : base register 2 - * _B3 : base register 3 - * _RO : offset register - * - * memory addresses for _VR, _VR+1, _VR+2, _VR+3 - * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively. - * - * MODIFIES: _VR, _VR+1, _VR+2, _VR+3 if a load - * operation is performed. - * IMPLICIT USE: see LDST - */ - .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO - LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO - LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO - LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO - LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO - .endm - - /* - * Preload/zero two cache lines and save 4 vector registers - * to memory. - * Note that the cache operation targets memory *past* the - * current storage area which should hopefully hit when - * This same code is executed on the next two cache lines... - * - * This code effectively does - * dcbz (_B0 + 64) - * dcbz (_B0 + 64 + 32) - * stvx _VF+0, (_B0+ 0) - * stvx _VF+1, (_B0+16) - * stvx _VF+2, (_B0+32) - * stvx _VF+3, (_B0+48) - * - * _LRU: may be 'l' or empty. The former variant should be - * used when it is conceivable that the memory area is - * unlikely to be used in the near future thus making - * it a candidate for early eviction from the caches. 
- * - * If it is likely that the memory area is reused soon - * (e.g., save/restore across ISR execution) then the - * 'stvx' opcode (w/o 'l' suffix) should be used. - * - * _VR: first of four target vector registers; _VR+0, - * _VR+1, _VR+2, _VR+3 are saved. - * - * _BO: base address of memory area. - * _B1: should contain _B0+16 on entry - * _B2: should contain _B0+32 on entry - * _B3: should contain _B0+48 on entry - * - * _O1: contains the offset where the four vectors are - * stored. - * _VR -> (_B0 + _O1) = (_B0 + _O1 + 0 ) - * _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 ) - * _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 ) - * _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 ) - * _O2: is set to _O1 + 64 by this macro. Hence _O2 is - * used to address the two cache-lines past the - * current memory area. - * - * MODIFIES: _O2; contains _O1 + 64 after execution of this - * code. - * - * NOTES: a different set of four vectors can be addressed - * simply by changing the one offset register _O1. - * - * Saving more than 4 registers can simply be - * achieved by expanding this macro multiple - * times with _O1 and _O2 swapped (new _O1 - * becomes _O2 = old _O1 + 64) thus stepping - * through the memory area. - * - */ - .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 - addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT - dcbz \_B0, \_O2 - dcbz \_B2, \_O2 - LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 - .endm - - /* - * Save eight vector registers by expanding S4VEC_P twice. - * See notes for S4VEC_P above. - * - * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) - * - * MODIFIES: After execution, - * _O2 contains original _O1 + 64, - * _O1 contains original _O1 + 128 - * - * NOTES: Expanding this macro multiple times lets you save - * multiple blocks of 8 registers (no reload of _Bx / _Ox is needed). - */ - .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 - S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 - /* Note that the roles of _O1 and _O2 are swapped here */ - S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1 - .endm - - /* - * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1) - * - * See notes above (for S4VEC_P). - * - * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) - * MODIFIES: _O1 contains original _O1 + 256 - * _O2 contains original _O1 + 256 - 64 - */ - .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 - S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 - S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 - LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 - .endm - - /* - * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1) - * - * See notes above (for S4VEC_P, S_V0TOV19). 
- * - * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) - * MODIFIES: _O1 contains original _O1 + 128 - * _O2 contains original _O1 + 128 - 64 - */ - .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 - S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 - LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1 - .endm - - /* - * Save all registers to memory area - * - * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) - * MODIFIES: _O1 contains original _O1 + 512 - * _O2 contains original _O1 + 512 - 64 - */ - .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 - S8VEC_P l v0 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 - S8VEC_P l v8 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 - S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 - S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 - LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2 - .endm - - - /* - * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively. - * We can pass either of them as arguments to another macro which - * allows us to decide if the main macro uses dcbt or not when - * we expand it... - */ - .macro DO_DCBT _RA, _RB - dcbt \_RA, \_RB - .endm - - .macro NO_DCBT _RA, _RB - .endm - - /* - * NOTE REGARDING dcbt VS dst - * - * Preloading the cache with memory areas that we soon need - * can be done either using 'dcbt' or 'dst' instructions - * "ahead of time". - * When experimenting (on a mpc7457) I found that the 'dst' - * stream instruction was very efficient if there is enough - * time to read ahead. It works well when we do a context - * switch: - * - * 1) start DST on new context to be loaded - * 2) save old context to memory - * 3) load new context from memory - * - * Because of the interleaved step 2) dst works nicely and - * 3) finds what it needs in the cache. - * - * However, in a situation when there is not much time - * to start the DST, e.g., because we want to restore - * a context out of the blue (e.g., after returning - * from and ISR): - * - * 1) save volatile registers to memory/stack - * 2) execute ISR - * 3) might do a task context switch - * 4) when returned to old task context then - * reload volatile registers from memory/stack. - * - * In this situation, preloading the target memory before - * or after step 1) makes obviously no sense because after - * 1) the registers area is most likely in the cache already. - * - * Starting preload after 2) doesn't make much sense either. - * If ISR doesn't lead to a context switch then it is quite - * likely that the register area is still in the cache. - * OTOTH, if a context switch happens then the preload after 2) - * might be useless. - * - * This leaves us at step 4) where we want to load immediately. - * In this case, I found that 'dcbt' works more efficiently - * so that's what we use when restoring volatile registers. - * - * When restoring the non-volatile VRs during a 'normal' - * context switch then we shall use DST (and no dcbt). - */ - - /* - * Symmetric to S4VEC_P above but addresses loading four - * vector registers from memory. - * - * Touches two cache lines past the current memory area - * and loads four vectors from the current area. - * - * Optionally, the DCBT operation may be omitted - * (when expanding with _DCBT=NO_DCBT). - * This is useful if the cache was already preloaded - * by another means (dst instruction). - * - * NOTE: We always use the 'LRU' form of lvx: lvxl, - * because we deem it unlikely that the context - * that was just loaded has to be saved again - * to memory in the immediate future. 
- * - * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded - * as explained above. - * - * MODIFIES: _O2 contains original _O1 + 64. - * _VR.._VR+3 loaded from memory. - */ - .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 - addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT - /* preload/touch 2 lines at offset 64 from _B0 */ - \_DCBT \_B0, \_O2 - \_DCBT \_B2, \_O2 - /* load four vectors at off set 0 from _B0 */ - LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1 - .endm - - /* - * Symmetric to S8VEC_P; loads 8 vector registers - * from memory -- see comments above... - * - * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded - * as explained above. - * - * MODIFIES: _O1 contains original _O1 + 128. - * _O2 contains original _O1 + 64. - * _VR.._VR+7 loaded from memory. - */ - .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 - L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1 - .endm - - /* - * Load volatile vector registers v0..v19 employing - * the DCBT to preload the cache. The rationale for - * using DCBT here but not when restoring non-volatile - * registers is explained above, see - * - * "NOTE REGARDING dcbt VS dst" - * - * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded - * as explained above. - * - * MODIFIES: _O1 contains original _O1 + 256. - * _O2 contains original _O1 + 256 - 64. - * VR0..VR19 loaded from memory. - */ - .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2 - L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1 - .endm - - /* - * Load non-volatile vector registers v20..v31. - * Note that no DCBT is performed since we use - * DST for preloading the cache during a context - * switch, see - * - * "NOTE REGARDING dcbt VS dst" - * - * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded - * as explained above. - * - * MODIFIES: _O1 contains original _O1 + 128. - * _O2 contains original _O1 + 128 - 64. - * VR20..VR31 loaded from memory. - */ - .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2 - L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1 - .endm - - /* - * Load all registers from memory area. - */ - .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 - L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2 - .endm - - /* - * Compute - * _B1 = _B0 + 16 - * _B2 = _B0 + 32 - * _B3 = _B0 + 48 - * and load - * _RO = 0 - * - * convenience macro to be expanded before - * any of the load/store macros that use - * four base addresses etc. - * - * INPUT: _B0 = cache-aligned start of memory area - * - * MODIFIES: _B1, _B2, _B3, _RO as described above. - */ - .macro CMP_BASES _B0, _B1, _B2, _B3, _RO - addi \_B1, \_B0, 1*VECSIZE - addi \_B2, \_B0, 2*VECSIZE - addi \_B3, \_B0, 3*VECSIZE - li \_RO, 0 - .endm - - /* - * Prepare for saving general vector registers. - * - * If not built with #define IGNORE_VRSAVE then - * - * 1) copy vrsave to CRC - * - * endif - * - * 2) copy vrsave to _VRSAVE_REG - * 3) preload/zero cache line where vrsave and vscr are stored. - * 4) compute base adresses from _B0 - * 5) preload/zero first two cache lines (remember that the - * first S8VEC_P starts preloading/zeroing at offset 64). 
- * - * INPUT: 'vrsave' register, _B0 (base address of memory area) - * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave') - * _B0 = original _BO + 32 - * _B1 = original _B0 + 32 + 16, - * _B2 = original _B0 + 32 + 32, - * _B3 = original _B0 + 32 + 48, - * CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined) - */ - .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO - mfvrsave \_VRSAVE_REG -#ifndef IGNORE_VRSAVE - mtcr \_VRSAVE_REG -#endif - dcbz 0, \_B0 - addi \_B0, \_B0, PPC_CACHE_ALIGNMENT - dcbz 0, \_B0 - CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO - dcbz 0, \_B2 - .endm - - /* - * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers - * must have been loaded from 'vrsave' and 'vscr', respectively, - * prior to expanding this macro. - * - * INPUTS: _VRSAVE_REG GPR holding 'vrsave' contents - * _VSCR_VREG VR holding 'vscr' contents - * _B0 cache-aligned (base) address of memory area. - * MODIFIES: _SCRATCH_REG - */ - .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG - stw \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) - li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF - stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG - .endm - - /* - * Load 'vrsave' and 'vscr' from memory. - * - * INPUTS: _B0 cache-aligned (base) address of memory area. - * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr) - * 'vscr', 'vrsave'. - * CRC (holds contents of 'vrsave') (ONLY IF COMPILED - * with IGNORE_VRSAVE undefined). - */ - .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG - lwz \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) - mtvrsave \_SCRATCH_REG -#ifndef IGNORE_VRSAVE - mtcr \_SCRATCH_REG -#endif - li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF - lvewx \_SCRATCH_VREG, \_B0, \_SCRATCH_REG - mtvscr \_SCRATCH_VREG - .endm - - /* - * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1) - * - * INPUT: _B0 - * MODIFIES: _B0 (as stated above) - */ - .macro CACHE_DOWNALGN _B0 - rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT - .endm - - .text - - .global _CPU_save_altivec_volatile -_CPU_save_altivec_volatile: - /* Align address up to next cache-line boundary */ - addi r3, r3, PPC_CACHE_ALIGNMENT - 1 - CACHE_DOWNALGN r3 - -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. - */ - mfcr r9 -#endif - - PREP_FOR_SAVE r0, r3, r4, r8, r6, r10 - /* r0 now contains VRSAVE, r3 still the aligned memory area - * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3, - * respectively. r10 holds zero - */ - S_V0TOV19 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11 - mfvscr v0 - /* Store vrsave (still in r0) and vscr (in v0) to memory area */ - S_VSCR_VRSAVE r0, v0, r3, r11 - -#ifndef IGNORE_VRSAVE - /* Restore CRC */ - mtcr r9 -#endif - blr - - .global _CPU_load_altivec_volatile -_CPU_load_altivec_volatile: - /* Align address up to next cache-line boundary */ - addi r3, r3, PPC_CACHE_ALIGNMENT - 1 - CACHE_DOWNALGN r3 -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. 
- */ - mfcr r9 -#endif - - /* Try to preload 1st line (where vscr and vrsave are stored) */ - dcbt 0, r3 - /* Point to start of general vector-register area */ - addi r3, r3, PPC_CACHE_ALIGNMENT - /* Start preloading 2nd line (where first two vectors are) */ - dcbt 0, r3 - L_VSCR_VRSAVE r3, r0, v0 - CMP_BASES r3, r4, r8, r6, r10 - /* Start preloading 3rd line (where vectors 3 and 4 are) */ - dcbt 0, r8 - L_V0TOV19 r3, r4, r8, r6, r10, r11 - -#ifndef IGNORE_VRSAVE - mtcr r9 -#endif - blr - - .global _CPU_Context_switch_altivec -_CPU_Context_switch_altivec: - - /* fetch offset of altivec area in context */ - CMPOFF r8 - /* down-align 'to' area to cache-line boundary */ - add r4, r4, r8 - CACHE_DOWNALGN r4 - - /* Check for PSIM */ - lis r6, _CPU_altivec_psim_cpu@ha - lwz r6, _CPU_altivec_psim_cpu@l(r6) - cmpli 0, r6, 0 - bne 1f - /* Skip data-stream instructions on PSIM (not implemented) */ - dssall - /* Pre-load new context into cache */ - lis r6, (BSIZE<<(24-16)) | (BCNT<<(16-16)) - ori r6, r6, BSTRIDE - dstt r4, r6, ds0 -1: - -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. - */ - mfcr r9 -#endif - - /* Is 'from' context == NULL ? (then we just do a 'restore') */ - cmpli 0, r3, 0 - beq 1f /* yes: skip saving 'from' context */ - - /* SAVE NON-VOLATILE REGISTERS */ - - /* Compute aligned destination pointer (r8 still holds offset - * to 'altivec' area in context) - */ - add r3, r3, r8 - CACHE_DOWNALGN r3 - - PREP_FOR_SAVE r0, r3, r8, r6, r7, r10 - /* The manual says reading vscr can take some time - do - * read it here (into a volatile vector register) while - * we wait for cache blocks to be allocated - */ - mfvscr v0 - S_V20TOV31 _LRU=l, _B0=r3, _B1=r8, _B2=r6, _B3=r7, _O1=r10, _O2=r11 - /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */ - S_VSCR_VRSAVE r0, v0, r3, r8 - -1: - - /* LOAD NON-VOLATILE REGISTERS */ - - /* Advance past vrsave/vscr area */ - addi r4, r4, PPC_CACHE_ALIGNMENT - L_VSCR_VRSAVE r4, r0, v0 - CMP_BASES r4, r8, r6, r7, r10 - L_V20TOV31 r4, r8, r6, r7, r10, r11 - -#ifndef IGNORE_VRSAVE - mtcr r9 -#endif - blr - - .global _CPU_Context_initialize_altivec -_CPU_Context_initialize_altivec: - CMPOFF r8 - add r3, r3, r8 - CACHE_DOWNALGN r3 - lis r8, _CPU_altivec_vrsave_initval@ha - lwz r8, _CPU_altivec_vrsave_initval@l(r8) - stw r8, VRSAVE_OFF(r3) - lis r6, _CPU_altivec_vscr_initval@ha - lwz r6, _CPU_altivec_vscr_initval@l(r6) - stw r6, VSCR_OFF(r3) - blr - - /* - * Change the initial value of VRSAVE. - * Can be used by initialization code if - * it is determined that code was compiled - * with -mvrsave=no. In this case, VRSAVE - * must be set to all-ones which causes this - * support code to save/restore *all* registers - * (only has an effect if IGNORE_VRSAVE is - * not defined -- otherwise all registers are - * saved/restored anyways). - */ - .global _CPU_altivec_set_vrsave_initval -_CPU_altivec_set_vrsave_initval: - lis r8, _CPU_altivec_vrsave_initval@ha - stw r3, _CPU_altivec_vrsave_initval@l(r8) - mtvrsave r3 - blr - -#ifdef ALTIVEC_TESTING - .global msr_VE_on -msr_VE_on: - mfmsr r3 - oris r3, r3, 1<<(31-6-16) - mtmsr r3 - blr - - .global msr_VE_off -msr_VE_off: - mfmsr r3 - lis r4, 1<<(31-6-16) - andc r3, r3, r4 - mtmsr r3 - blr - - - .global mfvrsave -mfvrsave: - mfvrsave r3 - blr - - .global mtvrsave -mtvrsave: - mtvrsave r3 - blr - - /* Load all vector registers from memory area. 
- * NOTE: This routine is not strictly ABI compliant -- - * it guarantees that volatile vector registers - * have certain values on exit! - */ - .global _CPU_altivec_load_all -_CPU_altivec_load_all: - /* Align address up to next cache-line boundary */ - addi r3, r3, PPC_CACHE_ALIGNMENT - 1 - CACHE_DOWNALGN r3 -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. - */ - mfcr r9 -#endif - - /* Try to preload 1st line (where vscr and vrsave are stored) */ - dcbt 0, r3 - /* Point to start of general vector-register area */ - addi r3, r3, PPC_CACHE_ALIGNMENT - /* Start preloading 2nd line (where first two vectors are) */ - dcbt 0, r3 - L_VSCR_VRSAVE r3, r0, v0 - CMP_BASES r3, r4, r8, r6, r10 - /* Start preloading 3rd line (where vectors 3 and 4 are) */ - dcbt 0, r8 - L_V0TOV31 r3, r4, r8, r6, r10, r11 - -#ifndef IGNORE_VRSAVE - mtcr r9 -#endif - blr - - .global _CPU_altivec_save_all -_CPU_altivec_save_all: - /* Align address up to next cache-line boundary */ - addi r3, r3, PPC_CACHE_ALIGNMENT - 1 - CACHE_DOWNALGN r3 - -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. - */ - mfcr r9 -#endif - - PREP_FOR_SAVE r0, r3, r4, r8, r6, r10 - /* r0 now contains VRSAVE, r3 still the aligned memory area - * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3, - * respectively. r10 holds zero - */ - S_V0TOV31 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11 - mfvscr v0 - /* Store vrsave (still in r0) and vscr (in v0) to memory area */ - S_VSCR_VRSAVE r0, v0, r3, r11 - -#ifndef IGNORE_VRSAVE - /* Restore CRC */ - mtcr r9 -#endif - blr - - -#if 0 - .gnu_attribute 4,1 - .gnu_attribute 8,1 -#endif - -#endif -#endif -- cgit v1.2.3
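
For illustration only (this example is not part of the patch): the README above
requires the BSP to set MSR_VE and to clear VRSAVE and VSCR before any AltiVec
code runs. A minimal sketch of how a BSP could do this from early C startup
code follows; the function name bsp_early_altivec_init() and the EARLY_MSR_VE
macro are invented for this example, and a real BSP may well prefer to do the
same work in its assembly start code before the first C function executes.

  /* Hypothetical early-boot hook; names are illustrative only. */
  #define EARLY_MSR_VE (1u << (31 - 6))   /* same bit as MSR_VE in vec_sup.c */

  void bsp_early_altivec_init(void)
  {
    unsigned long msr;

    /* Enable the vector unit first; VSCR is only accessible through it. */
    __asm__ volatile ("mfmsr %0" : "=r" (msr));
    msr |= EARLY_MSR_VE;
    __asm__ volatile ("mtmsr %0; isync" : : "r" (msr) : "memory");

    /* Clear VRSAVE so the -mvrsave probing in vec_sup.c starts from zero. */
    __asm__ volatile ("mtvrsave %0" : : "r" (0));

    /* Clear VSCR; the value has to pass through a scratch vector register. */
    __asm__ volatile ("vxor 0, 0, 0; mtvscr 0" : : : "v0");
  }

This sketch must be compiled with AltiVec enabled (e.g. -mcpu=7400) so that the
vxor/mtvscr instructions assemble, and it should run before
_CPU_Initialize_altivec() is called.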