From 4fd1ff0f0d8d1e3029f488a011acd83115dccdef Mon Sep 17 00:00:00 2001 From: Sebastian Huber Date: Mon, 26 Mar 2018 06:57:10 +0200 Subject: bsps/powerpc: Move AltiVec support to bsps This patch is a part of the BSP source reorganization. Update #3285. --- bsps/powerpc/shared/altivec/README | 184 +++++ bsps/powerpc/shared/altivec/vec_sup.c | 273 +++++++ bsps/powerpc/shared/altivec/vec_sup_asm.S | 821 +++++++++++++++++++++ c/src/lib/libbsp/powerpc/beatnik/Makefile.am | 12 +- .../libbsp/powerpc/motorola_powerpc/Makefile.am | 6 +- c/src/lib/libbsp/powerpc/mvme5500/Makefile.am | 10 +- c/src/lib/libbsp/powerpc/psim/Makefile.am | 4 +- c/src/lib/libcpu/powerpc/Makefile.am | 9 - c/src/lib/libcpu/powerpc/mpc6xx/altivec/README | 184 ----- c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c | 273 ------- .../libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S | 821 --------------------- 11 files changed, 1291 insertions(+), 1306 deletions(-) create mode 100644 bsps/powerpc/shared/altivec/README create mode 100644 bsps/powerpc/shared/altivec/vec_sup.c create mode 100644 bsps/powerpc/shared/altivec/vec_sup_asm.S delete mode 100644 c/src/lib/libcpu/powerpc/mpc6xx/altivec/README delete mode 100644 c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c delete mode 100644 c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S diff --git a/bsps/powerpc/shared/altivec/README b/bsps/powerpc/shared/altivec/README new file mode 100644 index 0000000000..61ebb8dded --- /dev/null +++ b/bsps/powerpc/shared/altivec/README @@ -0,0 +1,184 @@ +RTEMS ALTIVEC SUPPORT +===================== + +1. History +---------- + +Altivec support was developed and maintained as a user-extension +outside of RTEMS. This extension is still available (unbundled) +from Till Straumann ; it is useful +if an application desires 'lazy switching' of the altivec context. + +2. Modes +-------- + +Altivec support -- the unbundled extension, that is -- can be used +in two ways: + +a. All tasks are implicitly AltiVec-enabled. + +b. Only designated tasks are AltiVec-enabled. 'Lazy-context switching' + is implemented to switch AltiVec the context. + +Note that the code implemented in this directory supports mode 'a' +and mode 'a' ONLY. For mode 'b' you need the unbundled extension +(which is completely independent of this code). + +Mode 'a' (All tasks are AltiVec-enabled) +- - - - - - - - - - - - - - - - - - - - - + +The major disadvantage of this mode is that additional overhead is +involved: tasks that never use the vector unit still save/restore +the volatile vector registers (20 registers * 16bytes each) across +every interrupt and all non-volatile registers (12 registers * 16b each) +during every context switch. + +However, saving/restoring e.g., the volatile registers is quite +fast -- on my 1GHz 7457 saving or restoring 20 vector registers +takes only about 1us or even less (if there are cache hits). + +The advantage is complete transparency to the user and full ABI +compatibility (exept for ISRs and exception handlers), see below. + +Mode 'b' (Only dedicated tasks are AltiVec-enabled) +- - - - - - - - - - - - - - - - - - - - - - - - - - + +The advantage of this mode of operation is that the vector-registers +are only saved/restored when a different, altivec-enabled task becomes +ready to run. In particular, if there is only a single altivec-enabled +task then the altivec-context *never* is switched. + +Note that this mode of operation is not supported by the code +in this directory -- you need the unbundled altivec extension +mentioned above. + +3. 
Compiler Options +------------------- + +Three compiler options affect AltiVec: -maltivec, -mabi=altivec and +-mvrsave=yes/no. + +-maltivec: This lets the cpp define the symbol __ALTIVEC__ and enables + gcc to emit vector instructions. Note that gcc may use the + AltiVec engine implicitly, i.e., **without you writing any + vectorized code**. + +-mabi=altivec: This option has two effects: + i) It ensures 16-byte stack alignment required by AltiVec + (even in combination with eabi which is RTEMS' default). + ii) It allows vector arguments to be passed in vector registers. + +-mvrsave=yes/no: Instructs gcc to emit code which sets the VRSAVE register + indicating which vector registers are 'currently in use'. + Because the altivec support does not use this information *) the + option has no direct affect but it is desirable to compile with + -mvrsave=no so that no unnecessary code is generated. + + *) The file vec_sup_asm.S conditionally disables usage of + the VRSAVE information if the preprocessor symbol + 'IGNORE_VRSAVE' is defined, which is the default. + + If 'IGNORE_VRSAVE' is undefined then the code *does* + use the VRSAVE information but I found that this does + not execute noticeably faster. + +IMPORTANT NOTES +=============== + +AFAIK, RTEMS uses the EABI which requires a stack alignment of only 8 bytes +which is NOT enough for AltiVec (which requires 16-byte alignment). + +There are two ways for obtaining 16-byte alignment: + +I) Compile with -mno-eabi (ordinary SYSV ABI has 16-byte alignment) +II) Compile with -mabi=altivec (extension to EABI; maintains 16-byte alignment + but also allows for passing vector arguments in vector registers) + +Note that it is crucial to compile ***absolutely everything*** with the same +ABI options (or a linker error may occur). In particular, this includes + + - newlibc multilib variant + - RTEMS proper + - application + third-party code + +IMO the proper compiler options for Mode 'a' would be + + -maltivec -mabi=altivec -mvrsave=no + +Note that the -mcpu=7400 option also enables -maltivec and -mabi=altivec +but leaves -mvrsave at some 'default' which is probably 'no'. +Compiling with -mvrsave=yes does not produce incompatible code but +may have a performance impact (since extra code is produced to maintain +VRSAVE). + +4. Multilib Variants +-------------------- + +The default GCC configuration for RTEMS contains a -mcpu=7400 multilib +variant which is the correct one to choose. + +5. BSP 'custom' file. +--------------------- + +Now that you have the necessary newlib and libgcc etc. variants +you also need to build RTEMS accordingly. + +In you BSP's make/custom/.cfg file make sure the CPU_CFLAGS +select the desired variant: + +for mode 'a': + + CPU_CFLAGS = ... -mcpu=7400 + +Note that since -maltivec globally defines __ALTIVEC__ RTEMS automatially +enables code that takes care of switching the AltiVec context as necessary. +This is transparent to application code. + +6. BSP support +-------------- + +It is the BSP's responsibility to initialize MSR_VE, VSCR and VRSAVE +during early boot, ideally before any C-code is executed (because it +may, theoretically, use vector instructions). + +The BSP must + + - set MSR_VE + - clear VRSAVE; note that the probing algorithm for detecting + whether -mvrsave=yes or 'no' was used relies on the BSP + clearing VRSAVE during early start. 
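+   As a purely illustrative sketch (the choice of r3/v0 is an arbitrary
+   assumption; the exact code and its placement are up to the BSP), this
+   early initialization could look roughly like:
+
+     mfmsr    r3
+     oris     r3, r3, 0x0200    /* set MSR_VE (0x02000000, MSR bit 6) */
+     mtmsr    r3
+     isync
+     li       r3, 0
+     mtvrsave r3                /* clear VRSAVE                       */
+     vxor     0, 0, 0
+     mtvscr   0                 /* clear VSCR (via zeroed v0)         */
+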
Since no interrupts or + context switches happen before the AltiVec support is initialized + clearing VRSAVE is no problem even if it turns out that -mvrsave=no + was in effect (eventually a value of all-ones will be stored + in VRSAVE in this case). + - clear VSCR + +7. PSIM note +------------ + +PSIM supports the AltiVec instruction set with the exception of +the 'data stream' instructions for cache prefetching. The RTEMS +altivec support includes run-time checks to skip these instruction +when executing on PSIM. + +Note that AltiVec support within PSIM must be enabled at 'configure' +time by passing the 'configure' option + +--enable-sim-float=altivec + +Note also that PSIM's AltiVec support has many bugs. It is recommended +to apply the patches filed as an attachment with gdb bug report #2461 +prior to building PSIM. + +The CPU type and corresponding multilib must be changed when +building RTEMS/psim: + + edit make/custom/psim.cfg and change + + CPU_CFLAGS = ... -mcpu=603e + + to + + CPU_CFLAGS = ... -mcpu=7400 + +This change must be performed *before* configuring RTEMS/psim. diff --git a/bsps/powerpc/shared/altivec/vec_sup.c b/bsps/powerpc/shared/altivec/vec_sup.c new file mode 100644 index 0000000000..141779c175 --- /dev/null +++ b/bsps/powerpc/shared/altivec/vec_sup.c @@ -0,0 +1,273 @@ +/* Altivec support for RTEMS; vector register context management. */ + +/* + * Authorship + * ---------- + * This software was created by + * Till Straumann , 2009, + * Stanford Linear Accelerator Center, Stanford University. + * + * Acknowledgement of sponsorship + * ------------------------------ + * This software was produced by + * the Stanford Linear Accelerator Center, Stanford University, + * under Contract DE-AC03-76SFO0515 with the Department of Energy. + * + * Government disclaimer of liability + * ---------------------------------- + * Neither the United States nor the United States Department of Energy, + * nor any of their employees, makes any warranty, express or implied, or + * assumes any legal liability or responsibility for the accuracy, + * completeness, or usefulness of any data, apparatus, product, or process + * disclosed, or represents that its use would not infringe privately owned + * rights. + * + * Stanford disclaimer of liability + * -------------------------------- + * Stanford University makes no representations or warranties, express or + * implied, nor assumes any liability for the use of this software. + * + * Stanford disclaimer of copyright + * -------------------------------- + * Stanford University, owner of the copyright, hereby disclaims its + * copyright and all other rights in this software. Hence, anyone may + * freely use it for any purpose without restriction. + * + * Maintenance of notices + * ---------------------- + * In the interest of clarity regarding the origin and status of this + * SLAC software, this and all the preceding Stanford University notices + * are to remain affixed to any copy or derivative of this software made + * or distributed by the recipient and are to be affixed to any copy of + * software made or distributed by the recipient that contains a copy or + * derivative of this software. 
+ * + * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03 + */ + +#ifdef __ALTIVEC__ + +#include +#include +#include +#include +#include +#include + +#define STATIC static + +#define VEC_ALIGNMENT 16 + +#define NAM "AltiVec Support" +#define ERRID(a,b,c,d) (((a)<<24) | ((b)<<16) | ((c)<<8) | (d)) + +typedef uint32_t _vu32 __attribute__((vector_size(VEC_ALIGNMENT))); + +#ifndef MSR_VE +#define MSR_VE (1<<(31-6)) +#endif + +/* NOTE: These two variables are accessed by assembly code + * which assumes 32-bit data! + */ +uint32_t _CPU_altivec_ctxt_off = 0; +uint32_t _CPU_altivec_psim_cpu = 0; + +static inline uint32_t +mfmsr(void) +{ +uint32_t v; + _CPU_MSR_GET(v); + return v; +} + +static inline void +mtmsr(uint32_t v) +{ + _CPU_MSR_SET(v); +} + +static inline void +isync(void) +{ + asm volatile("isync"); +} + +static inline void +dssall(void) +{ + if ( !_CPU_altivec_psim_cpu) + asm volatile("dssall"); +} + +static inline uint32_t +set_MSR_VE(void) +{ +uint32_t rval; + rval=mfmsr(); + if ( ! (MSR_VE & rval ) ) { + mtmsr(rval | MSR_VE); + isync(); + } + return rval; +} + +static inline void +clr_MSR_VE(void) +{ + dssall(); + mtmsr(mfmsr() & ~MSR_VE); + isync(); +} + +static inline void +rst_MSR_VE(uint32_t old) +{ + if ( ! ( MSR_VE & old ) ) { + dssall(); + mtmsr(old); + isync(); + } +} + + +/* Code to probe the compiler's stack alignment (PowerPC); + * The routine determines at run-time if the compiler generated + * 8 or 16-byte aligned code. + * + * Till Straumann , 2005 + */ + +static void dummy(void) __attribute__((noinline)); +/* add (empty) asm-statement to make sure this isn't optimized away */ +static void dummy(void) { __asm__ volatile(""); } + +static unsigned probe_r1(void) __attribute__((noinline)); +static unsigned probe_r1(void) +{ +unsigned r1; + /* call something to enforce creation of a minimal stack frame; + * (8 bytes: r1 and lr space for 'dummy' callee). If compiled + * with -meabi -mno-altivec gcc allocates 8 bytes, if -mno-eabi + * or -maltivec / -mabi=altivec then gcc allocates 16 bytes + * according to the sysv / altivec ABI specs. + */ + dummy(); + /* return stack pointer */ + asm volatile("mr %0,1":"=r"(r1)); + return r1; +} + +static unsigned +probe_ppc_stack_alignment(void) +{ +unsigned r1; + asm volatile("mr %0,1":"=r"(r1)); + return (r1 - probe_r1()) & ~ 0xf; +} + +STATIC int check_stack_alignment(void) +{ +int rval = 0; + if ( VEC_ALIGNMENT > PPC_STACK_ALIGNMENT ) { + printk(NAM": CPU support has unsufficient stack alignment;\n"); + printk("modify 'cpukit/score/cpu/powerpc/rtems/score/powerpc.h'\n"); + printk("and choose PPC_ABI_SVR4. I'll enable a workaround for now.\n"); + rval |= 1; + } + /* Run-time check; should compile with -mabi=altivec */ + if ( probe_ppc_stack_alignment() < VEC_ALIGNMENT ) { + printk(NAM": run-time stack alignment unsufficient; make sure you compile with -mabi=altivec\n"); + rval |= 2; + } + return rval; +} + + +static uint32_t probe_vrsave(_vu32 *p_v) __attribute__((noinline)); + +/* Check if this code was compiled with -mvrsave=yes or no + * so that we can set the default/init value accordingly. + */ +static uint32_t probe_vrsave(_vu32 *p_v) +{ +_vu32 x; +uint32_t vrsave; + /* Explicitly clobber a volatile vector reg (0) that is + * not used to pass return values. + * If -mvrsave=yes was used this should cause gcc to + * set bit 0 in vrsave. OTOH this bit cannot be set + * because v0 is volatile and not used to pass a value + * to the caller... 
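+ * (vrsave_yes() below compares the vrsave value it read before this
+ * probe with the value returned here; a difference indicates that the
+ * code was compiled with -mvrsave=yes.)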
+ */ + asm volatile("vxor %0, 0, 0; mfvrsave %1":"=v"(x),"=r"(vrsave)::"v0"); + if ( p_v ) { + *p_v = x; + } + return vrsave; +} + +static int vrsave_yes(void) __attribute__((noinline)); + +static int vrsave_yes(void) +{ +uint32_t vrsave_pre; + asm volatile("mfvrsave %0":"=r"(vrsave_pre)); + if ( (vrsave_pre & 0x80000000) ) { + printk(NAM": WARNING - unable to determine whether -mvrsave was used; assuming NO\n"); + return 0; + } + return probe_vrsave(0) != vrsave_pre; +} + +extern void +_CPU_altivec_set_vrsave_initval(uint32_t); + + +void +_CPU_Initialize_altivec(void) +{ +unsigned pvr; + + /* I don't like to have to #define the offset of the altivec area + * for use by assembly code. + * Therefore, we compute it here and store it in memory... + */ + _CPU_altivec_ctxt_off = offsetof(ppc_context, altivec); + + /* + * See ppc_get_context() and PPC_CONTEXT_OFFSET_GPR1 + */ + _CPU_altivec_ctxt_off += PPC_DEFAULT_CACHE_LINE_SIZE; + + /* + * Add space possibly needed for alignment + */ + _CPU_altivec_ctxt_off += PPC_CACHE_ALIGNMENT - 1; + + if ( ! vrsave_yes() ) { + /* They seemed to compile with -mvrsave=no. Hence we + * must set VRSAVE so that all registers are saved/restored + * in case this support was not built with IGNORE_VRSAVE. + */ + _CPU_altivec_set_vrsave_initval( -1 ); + } + + if ( check_stack_alignment() & 2 ) + rtems_fatal_error_occurred(ERRID('V','E','C','1')); + + pvr = get_ppc_cpu_type(); + /* psim has altivec but lacks the streaming instructions :-( */ + _CPU_altivec_psim_cpu = (PPC_PSIM == pvr); + + if ( ! ppc_cpu_has_altivec() ) { + printk(NAM": This CPU seems not to have AltiVec\n"); + rtems_panic("Unable to initialize AltiVec Support\n"); + } + + if ( ! (mfmsr() & MSR_VE) ) { + printk(NAM": Warning: BSP should set MSR_VE early; doing it now...\n"); + set_MSR_VE(); + } +} +#endif diff --git a/bsps/powerpc/shared/altivec/vec_sup_asm.S b/bsps/powerpc/shared/altivec/vec_sup_asm.S new file mode 100644 index 0000000000..279d1704a7 --- /dev/null +++ b/bsps/powerpc/shared/altivec/vec_sup_asm.S @@ -0,0 +1,821 @@ +#ifdef __ALTIVEC__ + +/* Altivec support for RTEMS; vector register context management. */ + +/* + * Authorship + * ---------- + * This software was created by + * Till Straumann , 2009, + * Stanford Linear Accelerator Center, Stanford University. + * + * Acknowledgement of sponsorship + * ------------------------------ + * This software was produced by + * the Stanford Linear Accelerator Center, Stanford University, + * under Contract DE-AC03-76SFO0515 with the Department of Energy. + * + * Government disclaimer of liability + * ---------------------------------- + * Neither the United States nor the United States Department of Energy, + * nor any of their employees, makes any warranty, express or implied, or + * assumes any legal liability or responsibility for the accuracy, + * completeness, or usefulness of any data, apparatus, product, or process + * disclosed, or represents that its use would not infringe privately owned + * rights. + * + * Stanford disclaimer of liability + * -------------------------------- + * Stanford University makes no representations or warranties, express or + * implied, nor assumes any liability for the use of this software. + * + * Stanford disclaimer of copyright + * -------------------------------- + * Stanford University, owner of the copyright, hereby disclaims its + * copyright and all other rights in this software. Hence, anyone may + * freely use it for any purpose without restriction. 
+ * + * Maintenance of notices + * ---------------------- + * In the interest of clarity regarding the origin and status of this + * SLAC software, this and all the preceding Stanford University notices + * are to remain affixed to any copy or derivative of this software made + * or distributed by the recipient and are to be affixed to any copy of + * software made or distributed by the recipient that contains a copy or + * derivative of this software. + * + * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03 + */ + + +#include + +#ifndef PPC_CACHE_ALIGNMENT +#error "Missing header; PPC_CACHE_ALIGNMENT is not defined" +#endif + +#define ALTIVEC_TESTING + +#if PPC_CACHE_ALIGNMENT != 32 +#error "Altivec support assumes cache-line size is 32 bytes!" +#else +#undef LD_PPC_CACHE_ALIGNMENT +#define LD_PPC_CACHE_ALIGNMENT 5 +#endif + + .set v0, 0 + .set v8, 8 + .set v16, 16 + .set v20, 20 + .set v24, 24 + .set v28, 28 + + .set r0, 0 + .set r3, 3 + .set r4, 4 + /* Do not use r5, since this is used by _CPU_Context_switch() */ + .set r6, 6 + .set r7, 7 + .set r8, 8 + .set r9, 9 + .set r10, 10 + .set r11, 11 + /* Do not use r12, since this is used by _CPU_Context_switch() */ + + .set cr5, 5 + + .set VECSIZE, 16 + + .set VRSAVE_INIT_VAL, 0 + .set VSCR_INIT_VAL, 0 + + .set VRSAVE_OFF, 16 + .set VSCR_OFF, 16+12 + + .set ds0, 0 + + /* Block size for dst -- in units of 16-bytes */ + .set BSIZE, 2 /* = 32 bytes */ + .set BCNT, 12/2+1 /* 12 non-volatile registers + area for vscr/vrsave */ + .set BSTRIDE, 32 /* bytes */ + + .data + + .global _CPU_altivec_vrsave_initval +_CPU_altivec_vrsave_initval: + .long 0 + + .global _CPU_altivec_vscr_initval +_CPU_altivec_vscr_initval: + .long 0 + + .text + + .extern _CPU_altivec_psim_cpu + .extern _CPU_altivec_ctxt_off + + .macro CMPOFF _B0 + lis \_B0, _CPU_altivec_ctxt_off@ha + lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0) + .endm + + /* Conditionally load or store a vector _VR to + * EA(_R1|0 + _R2) + * If bit _VR (corresponding to _VR) is set in CRC + * then the load/store is performed but otherwise + * it is skipped. + * If compiled with IGNORE_VRSAVE defined then + * the load/store is done unconditionally. + * + * _OPCODE: intended to be lvx, lvxl, stvx or stvxl + * _VR : target vector register + * _R1 : base register (NOTE: _R1=r0 uses a + * implicit ZERO constant, not the contents + * of r0) for address computation. + * _R2 : 'offset' register for address computation. + * + * MODIFIES: _VR on output if a load operation is performed. + * IMPLICIT USE: CRC (unless compiled with IGNORE_VRSAVE + * defined. + */ + .macro LDST _OPCODE, _VR, _R1, _R2 +#ifndef IGNORE_VRSAVE + bc 4, \_VR, 111f +#endif + \_OPCODE \_VR, \_R1, \_R2 +111: + .endm + + /* + * Load or store four 'adjacent' vector registers. + * + * _OPCODE: intended to be lvx, lvxl, stvx or stvxl + * _VR : target vector register + * _R1 : base register (NOTE: _R1=r0 uses a + * implicit ZERO constant, not the contents + * of r0) for address computation. + * _B0 : base register 0 + * _B1 : base register 1 + * _B2 : base register 2 + * _B3 : base register 3 + * _RO : offset register + * + * memory addresses for _VR, _VR+1, _VR+2, _VR+3 + * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively. + * + * MODIFIES: _VR, _VR+1, _VR+2, _VR+3 if a load + * operation is performed. 
+ * IMPLICIT USE: see LDST + */ + .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO + LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO + LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO + .endm + + /* + * Preload/zero two cache lines and save 4 vector registers + * to memory. + * Note that the cache operation targets memory *past* the + * current storage area which should hopefully hit when + * This same code is executed on the next two cache lines... + * + * This code effectively does + * dcbz (_B0 + 64) + * dcbz (_B0 + 64 + 32) + * stvx _VF+0, (_B0+ 0) + * stvx _VF+1, (_B0+16) + * stvx _VF+2, (_B0+32) + * stvx _VF+3, (_B0+48) + * + * _LRU: may be 'l' or empty. The former variant should be + * used when it is conceivable that the memory area is + * unlikely to be used in the near future thus making + * it a candidate for early eviction from the caches. + * + * If it is likely that the memory area is reused soon + * (e.g., save/restore across ISR execution) then the + * 'stvx' opcode (w/o 'l' suffix) should be used. + * + * _VR: first of four target vector registers; _VR+0, + * _VR+1, _VR+2, _VR+3 are saved. + * + * _BO: base address of memory area. + * _B1: should contain _B0+16 on entry + * _B2: should contain _B0+32 on entry + * _B3: should contain _B0+48 on entry + * + * _O1: contains the offset where the four vectors are + * stored. + * _VR -> (_B0 + _O1) = (_B0 + _O1 + 0 ) + * _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 ) + * _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 ) + * _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 ) + * _O2: is set to _O1 + 64 by this macro. Hence _O2 is + * used to address the two cache-lines past the + * current memory area. + * + * MODIFIES: _O2; contains _O1 + 64 after execution of this + * code. + * + * NOTES: a different set of four vectors can be addressed + * simply by changing the one offset register _O1. + * + * Saving more than 4 registers can simply be + * achieved by expanding this macro multiple + * times with _O1 and _O2 swapped (new _O1 + * becomes _O2 = old _O1 + 64) thus stepping + * through the memory area. + * + */ + .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT + dcbz \_B0, \_O2 + dcbz \_B2, \_O2 + LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 + .endm + + /* + * Save eight vector registers by expanding S4VEC_P twice. + * See notes for S4VEC_P above. + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * + * MODIFIES: After execution, + * _O2 contains original _O1 + 64, + * _O1 contains original _O1 + 128 + * + * NOTES: Expanding this macro multiple times lets you save + * multiple blocks of 8 registers (no reload of _Bx / _Ox is needed). + */ + .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + /* Note that the roles of _O1 and _O2 are swapped here */ + S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1 + .endm + + /* + * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1) + * + * See notes above (for S4VEC_P). 
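+	 * (Expands to two S8VEC_P blocks covering v0..v15 plus one LDST4
+	 * for v16..v19, i.e. 20 vectors of 16 bytes = 320 bytes total.)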
+ * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 256 + * _O2 contains original _O1 + 256 - 64 + */ + .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 + .endm + + /* + * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1) + * + * See notes above (for S4VEC_P, S_V0TOV19). + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 128 + * _O2 contains original _O1 + 128 - 64 + */ + .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 + LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1 + .endm + + /* + * Save all registers to memory area + * + * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) + * MODIFIES: _O1 contains original _O1 + 512 + * _O2 contains original _O1 + 512 - 64 + */ + .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + S8VEC_P l v0 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S8VEC_P l v8 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 + LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2 + .endm + + + /* + * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively. + * We can pass either of them as arguments to another macro which + * allows us to decide if the main macro uses dcbt or not when + * we expand it... + */ + .macro DO_DCBT _RA, _RB + dcbt \_RA, \_RB + .endm + + .macro NO_DCBT _RA, _RB + .endm + + /* + * NOTE REGARDING dcbt VS dst + * + * Preloading the cache with memory areas that we soon need + * can be done either using 'dcbt' or 'dst' instructions + * "ahead of time". + * When experimenting (on a mpc7457) I found that the 'dst' + * stream instruction was very efficient if there is enough + * time to read ahead. It works well when we do a context + * switch: + * + * 1) start DST on new context to be loaded + * 2) save old context to memory + * 3) load new context from memory + * + * Because of the interleaved step 2) dst works nicely and + * 3) finds what it needs in the cache. + * + * However, in a situation when there is not much time + * to start the DST, e.g., because we want to restore + * a context out of the blue (e.g., after returning + * from and ISR): + * + * 1) save volatile registers to memory/stack + * 2) execute ISR + * 3) might do a task context switch + * 4) when returned to old task context then + * reload volatile registers from memory/stack. + * + * In this situation, preloading the target memory before + * or after step 1) makes obviously no sense because after + * 1) the registers area is most likely in the cache already. + * + * Starting preload after 2) doesn't make much sense either. + * If ISR doesn't lead to a context switch then it is quite + * likely that the register area is still in the cache. + * OTOTH, if a context switch happens then the preload after 2) + * might be useless. + * + * This leaves us at step 4) where we want to load immediately. + * In this case, I found that 'dcbt' works more efficiently + * so that's what we use when restoring volatile registers. + * + * When restoring the non-volatile VRs during a 'normal' + * context switch then we shall use DST (and no dcbt). 
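+	 *
+	 * In short: the volatile-register restore path (L_V0TOV19) uses
+	 * dcbt, whereas the non-volatile path (L_V20TOV31) relies on the
+	 * dst stream started by _CPU_Context_switch_altivec.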
+ */ + + /* + * Symmetric to S4VEC_P above but addresses loading four + * vector registers from memory. + * + * Touches two cache lines past the current memory area + * and loads four vectors from the current area. + * + * Optionally, the DCBT operation may be omitted + * (when expanding with _DCBT=NO_DCBT). + * This is useful if the cache was already preloaded + * by another means (dst instruction). + * + * NOTE: We always use the 'LRU' form of lvx: lvxl, + * because we deem it unlikely that the context + * that was just loaded has to be saved again + * to memory in the immediate future. + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O2 contains original _O1 + 64. + * _VR.._VR+3 loaded from memory. + */ + .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT + /* preload/touch 2 lines at offset 64 from _B0 */ + \_DCBT \_B0, \_O2 + \_DCBT \_B2, \_O2 + /* load four vectors at off set 0 from _B0 */ + LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Symmetric to S8VEC_P; loads 8 vector registers + * from memory -- see comments above... + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 128. + * _O2 contains original _O1 + 64. + * _VR.._VR+7 loaded from memory. + */ + .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 + L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1 + .endm + + /* + * Load volatile vector registers v0..v19 employing + * the DCBT to preload the cache. The rationale for + * using DCBT here but not when restoring non-volatile + * registers is explained above, see + * + * "NOTE REGARDING dcbt VS dst" + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 256. + * _O2 contains original _O1 + 256 - 64. + * VR0..VR19 loaded from memory. + */ + .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Load non-volatile vector registers v20..v31. + * Note that no DCBT is performed since we use + * DST for preloading the cache during a context + * switch, see + * + * "NOTE REGARDING dcbt VS dst" + * + * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded + * as explained above. + * + * MODIFIES: _O1 contains original _O1 + 128. + * _O2 contains original _O1 + 128 - 64. + * VR20..VR31 loaded from memory. + */ + .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1 + .endm + + /* + * Load all registers from memory area. + */ + .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 + L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 + LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2 + .endm + + /* + * Compute + * _B1 = _B0 + 16 + * _B2 = _B0 + 32 + * _B3 = _B0 + 48 + * and load + * _RO = 0 + * + * convenience macro to be expanded before + * any of the load/store macros that use + * four base addresses etc. + * + * INPUT: _B0 = cache-aligned start of memory area + * + * MODIFIES: _B1, _B2, _B3, _RO as described above. 
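+	 *
+	 * Example: 'CMP_BASES r3, r4, r8, r6, r10' (the invocation used
+	 * in _CPU_load_altivec_volatile) yields r4 = r3 + 16,
+	 * r8 = r3 + 32, r6 = r3 + 48 and r10 = 0.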
+ */ + .macro CMP_BASES _B0, _B1, _B2, _B3, _RO + addi \_B1, \_B0, 1*VECSIZE + addi \_B2, \_B0, 2*VECSIZE + addi \_B3, \_B0, 3*VECSIZE + li \_RO, 0 + .endm + + /* + * Prepare for saving general vector registers. + * + * If not built with #define IGNORE_VRSAVE then + * + * 1) copy vrsave to CRC + * + * endif + * + * 2) copy vrsave to _VRSAVE_REG + * 3) preload/zero cache line where vrsave and vscr are stored. + * 4) compute base adresses from _B0 + * 5) preload/zero first two cache lines (remember that the + * first S8VEC_P starts preloading/zeroing at offset 64). + * + * INPUT: 'vrsave' register, _B0 (base address of memory area) + * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave') + * _B0 = original _BO + 32 + * _B1 = original _B0 + 32 + 16, + * _B2 = original _B0 + 32 + 32, + * _B3 = original _B0 + 32 + 48, + * CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined) + */ + .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO + mfvrsave \_VRSAVE_REG +#ifndef IGNORE_VRSAVE + mtcr \_VRSAVE_REG +#endif + dcbz 0, \_B0 + addi \_B0, \_B0, PPC_CACHE_ALIGNMENT + dcbz 0, \_B0 + CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO + dcbz 0, \_B2 + .endm + + /* + * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers + * must have been loaded from 'vrsave' and 'vscr', respectively, + * prior to expanding this macro. + * + * INPUTS: _VRSAVE_REG GPR holding 'vrsave' contents + * _VSCR_VREG VR holding 'vscr' contents + * _B0 cache-aligned (base) address of memory area. + * MODIFIES: _SCRATCH_REG + */ + .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG + stw \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) + li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF + stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG + .endm + + /* + * Load 'vrsave' and 'vscr' from memory. + * + * INPUTS: _B0 cache-aligned (base) address of memory area. + * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr) + * 'vscr', 'vrsave'. + * CRC (holds contents of 'vrsave') (ONLY IF COMPILED + * with IGNORE_VRSAVE undefined). + */ + .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG + lwz \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) + mtvrsave \_SCRATCH_REG +#ifndef IGNORE_VRSAVE + mtcr \_SCRATCH_REG +#endif + li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF + lvewx \_SCRATCH_VREG, \_B0, \_SCRATCH_REG + mtvscr \_SCRATCH_VREG + .endm + + /* + * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1) + * + * INPUT: _B0 + * MODIFIES: _B0 (as stated above) + */ + .macro CACHE_DOWNALGN _B0 + rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT + .endm + + .text + + .global _CPU_save_altivec_volatile +_CPU_save_altivec_volatile: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + PREP_FOR_SAVE r0, r3, r4, r8, r6, r10 + /* r0 now contains VRSAVE, r3 still the aligned memory area + * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3, + * respectively. 
r10 holds zero + */ + S_V0TOV19 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11 + mfvscr v0 + /* Store vrsave (still in r0) and vscr (in v0) to memory area */ + S_VSCR_VRSAVE r0, v0, r3, r11 + +#ifndef IGNORE_VRSAVE + /* Restore CRC */ + mtcr r9 +#endif + blr + + .global _CPU_load_altivec_volatile +_CPU_load_altivec_volatile: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + /* Try to preload 1st line (where vscr and vrsave are stored) */ + dcbt 0, r3 + /* Point to start of general vector-register area */ + addi r3, r3, PPC_CACHE_ALIGNMENT + /* Start preloading 2nd line (where first two vectors are) */ + dcbt 0, r3 + L_VSCR_VRSAVE r3, r0, v0 + CMP_BASES r3, r4, r8, r6, r10 + /* Start preloading 3rd line (where vectors 3 and 4 are) */ + dcbt 0, r8 + L_V0TOV19 r3, r4, r8, r6, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r9 +#endif + blr + + .global _CPU_Context_switch_altivec +_CPU_Context_switch_altivec: + + /* fetch offset of altivec area in context */ + CMPOFF r8 + /* down-align 'to' area to cache-line boundary */ + add r4, r4, r8 + CACHE_DOWNALGN r4 + + /* Check for PSIM */ + lis r6, _CPU_altivec_psim_cpu@ha + lwz r6, _CPU_altivec_psim_cpu@l(r6) + cmpli 0, r6, 0 + bne 1f + /* Skip data-stream instructions on PSIM (not implemented) */ + dssall + /* Pre-load new context into cache */ + lis r6, (BSIZE<<(24-16)) | (BCNT<<(16-16)) + ori r6, r6, BSTRIDE + dstt r4, r6, ds0 +1: + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + /* Is 'from' context == NULL ? (then we just do a 'restore') */ + cmpli 0, r3, 0 + beq 1f /* yes: skip saving 'from' context */ + + /* SAVE NON-VOLATILE REGISTERS */ + + /* Compute aligned destination pointer (r8 still holds offset + * to 'altivec' area in context) + */ + add r3, r3, r8 + CACHE_DOWNALGN r3 + + PREP_FOR_SAVE r0, r3, r8, r6, r7, r10 + /* The manual says reading vscr can take some time - do + * read it here (into a volatile vector register) while + * we wait for cache blocks to be allocated + */ + mfvscr v0 + S_V20TOV31 _LRU=l, _B0=r3, _B1=r8, _B2=r6, _B3=r7, _O1=r10, _O2=r11 + /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */ + S_VSCR_VRSAVE r0, v0, r3, r8 + +1: + + /* LOAD NON-VOLATILE REGISTERS */ + + /* Advance past vrsave/vscr area */ + addi r4, r4, PPC_CACHE_ALIGNMENT + L_VSCR_VRSAVE r4, r0, v0 + CMP_BASES r4, r8, r6, r7, r10 + L_V20TOV31 r4, r8, r6, r7, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r9 +#endif + blr + + .global _CPU_Context_initialize_altivec +_CPU_Context_initialize_altivec: + CMPOFF r8 + add r3, r3, r8 + CACHE_DOWNALGN r3 + lis r8, _CPU_altivec_vrsave_initval@ha + lwz r8, _CPU_altivec_vrsave_initval@l(r8) + stw r8, VRSAVE_OFF(r3) + lis r6, _CPU_altivec_vscr_initval@ha + lwz r6, _CPU_altivec_vscr_initval@l(r6) + stw r6, VSCR_OFF(r3) + blr + + /* + * Change the initial value of VRSAVE. + * Can be used by initialization code if + * it is determined that code was compiled + * with -mvrsave=no. In this case, VRSAVE + * must be set to all-ones which causes this + * support code to save/restore *all* registers + * (only has an effect if IGNORE_VRSAVE is + * not defined -- otherwise all registers are + * saved/restored anyways). 
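+	 *
+	 * Typical use -- see _CPU_Initialize_altivec() in vec_sup.c:
+	 *
+	 *    _CPU_altivec_set_vrsave_initval( -1 );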
+ */ + .global _CPU_altivec_set_vrsave_initval +_CPU_altivec_set_vrsave_initval: + lis r8, _CPU_altivec_vrsave_initval@ha + stw r3, _CPU_altivec_vrsave_initval@l(r8) + mtvrsave r3 + blr + +#ifdef ALTIVEC_TESTING + .global msr_VE_on +msr_VE_on: + mfmsr r3 + oris r3, r3, 1<<(31-6-16) + mtmsr r3 + blr + + .global msr_VE_off +msr_VE_off: + mfmsr r3 + lis r4, 1<<(31-6-16) + andc r3, r3, r4 + mtmsr r3 + blr + + + .global mfvrsave +mfvrsave: + mfvrsave r3 + blr + + .global mtvrsave +mtvrsave: + mtvrsave r3 + blr + + /* Load all vector registers from memory area. + * NOTE: This routine is not strictly ABI compliant -- + * it guarantees that volatile vector registers + * have certain values on exit! + */ + .global _CPU_altivec_load_all +_CPU_altivec_load_all: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + /* Try to preload 1st line (where vscr and vrsave are stored) */ + dcbt 0, r3 + /* Point to start of general vector-register area */ + addi r3, r3, PPC_CACHE_ALIGNMENT + /* Start preloading 2nd line (where first two vectors are) */ + dcbt 0, r3 + L_VSCR_VRSAVE r3, r0, v0 + CMP_BASES r3, r4, r8, r6, r10 + /* Start preloading 3rd line (where vectors 3 and 4 are) */ + dcbt 0, r8 + L_V0TOV31 r3, r4, r8, r6, r10, r11 + +#ifndef IGNORE_VRSAVE + mtcr r9 +#endif + blr + + .global _CPU_altivec_save_all +_CPU_altivec_save_all: + /* Align address up to next cache-line boundary */ + addi r3, r3, PPC_CACHE_ALIGNMENT - 1 + CACHE_DOWNALGN r3 + +#ifndef IGNORE_VRSAVE + /* Save CRC -- it is used implicitly by all the LOAD/STORE macros + * when testing if we really should do the load/store operation. + */ + mfcr r9 +#endif + + PREP_FOR_SAVE r0, r3, r4, r8, r6, r10 + /* r0 now contains VRSAVE, r3 still the aligned memory area + * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3, + * respectively. 
r10 holds zero + */ + S_V0TOV31 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11 + mfvscr v0 + /* Store vrsave (still in r0) and vscr (in v0) to memory area */ + S_VSCR_VRSAVE r0, v0, r3, r11 + +#ifndef IGNORE_VRSAVE + /* Restore CRC */ + mtcr r9 +#endif + blr + + +#if 0 + .gnu_attribute 4,1 + .gnu_attribute 8,1 +#endif + +#endif +#endif diff --git a/c/src/lib/libbsp/powerpc/beatnik/Makefile.am b/c/src/lib/libbsp/powerpc/beatnik/Makefile.am index 08bec33bb7..fa29c8c9ac 100644 --- a/c/src/lib/libbsp/powerpc/beatnik/Makefile.am +++ b/c/src/lib/libbsp/powerpc/beatnik/Makefile.am @@ -158,11 +158,16 @@ network_if_em.rel: network_if_em_tmp.rel $(OBJCOPY) -G rtems_em_attach -G net_driver_ticks_per_sec \ -G rtems_em_pci_setup -G rtems_em_early_link_check_ops \ $^ $@ + +libbsp_a_LIBADD = network_support.rel \ + network_if_mve.rel network_if_gfe.rel network_if_em.rel endif # tod libbsp_a_SOURCES += ../../shared/tod.c tod/todcfg.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup_asm.S libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/cache/cache.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/dev/clock-ppc-dec.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/bat.c @@ -171,13 +176,6 @@ libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/pte121.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/irq/ppc-irq-legacy.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/ppc-dec-timer.c -libbsp_a_LIBADD = ../../../libcpu/@RTEMS_CPU@/mpc6xx/altivec.rel - -if HAS_NETWORKING -libbsp_a_LIBADD += network_support.rel \ - network_if_mve.rel network_if_gfe.rel network_if_em.rel -endif - EXTRA_DIST += README LICENSE include $(top_srcdir)/../../../../automake/local.am diff --git a/c/src/lib/libbsp/powerpc/motorola_powerpc/Makefile.am b/c/src/lib/libbsp/powerpc/motorola_powerpc/Makefile.am index aa27a39c72..e3be115a23 100644 --- a/c/src/lib/libbsp/powerpc/motorola_powerpc/Makefile.am +++ b/c/src/lib/libbsp/powerpc/motorola_powerpc/Makefile.am @@ -114,6 +114,8 @@ libbsp_a_SOURCES += ../../i386/pc386/ne2000/ne2000.c endif endif +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup_asm.S libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/cache/cache.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/dev/clock-ppc-dec.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/bat.c @@ -122,9 +124,7 @@ libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/pte121.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/irq/ppc-irq-legacy.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/ppc-dec-timer.c -libbsp_a_LIBADD = \ - polledIO.rel \ - ../../../libcpu/@RTEMS_CPU@/mpc6xx/altivec.rel +libbsp_a_LIBADD = polledIO.rel EXTRA_DIST += BOOTING README.mtx603e README.MVME2100 README.MVME2300 \ README.MVME2400 \ diff --git a/c/src/lib/libbsp/powerpc/mvme5500/Makefile.am b/c/src/lib/libbsp/powerpc/mvme5500/Makefile.am index 1ad78e0776..450153a739 100644 --- a/c/src/lib/libbsp/powerpc/mvme5500/Makefile.am +++ b/c/src/lib/libbsp/powerpc/mvme5500/Makefile.am @@ -66,6 +66,7 @@ network_rel_SOURCES = network/if_100MHz/GT64260eth.c \ network/if_1GHz/if_wm.c network/if_1GHz/pci_map.c network_rel_CPPFLAGS = $(AM_CPPFLAGS) $(network_CPPFLAGS) network_rel_LDFLAGS = $(RTEMS_RELLDFLAGS) +libbsp_a_LIBADD = network.rel endif EXTRA_DIST += 
../../powerpc/shared/start/rtems_crti.S @@ -83,6 +84,8 @@ project_lib_DATA += mvme5500start.$(OBJEXT) project_lib_DATA += linkcmds dist_project_lib_DATA += ../shared/startup/linkcmds.share +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup_asm.S libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/cache/cache.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/dev/clock-ppc-dec.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/bat.c @@ -91,13 +94,6 @@ libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/pte121.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/irq/ppc-irq-legacy.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/ppc-dec-timer.c -libbsp_a_LIBADD = \ - ../../../libcpu/@RTEMS_CPU@/mpc6xx/altivec.rel - -if HAS_NETWORKING -libbsp_a_LIBADD += network.rel -endif - include $(top_srcdir)/../../../../automake/local.am include $(srcdir)/../../../../../../bsps/powerpc/shared/shared.am include $(srcdir)/../../../../../../bsps/powerpc/shared/exceptions.am diff --git a/c/src/lib/libbsp/powerpc/psim/Makefile.am b/c/src/lib/libbsp/powerpc/psim/Makefile.am index 2192daba95..49b809cd1b 100644 --- a/c/src/lib/libbsp/powerpc/psim/Makefile.am +++ b/c/src/lib/libbsp/powerpc/psim/Makefile.am @@ -60,6 +60,8 @@ if HAS_NETWORKING libbsp_a_SOURCES += network/if_sim.c endif +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup.c +libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/altivec/vec_sup_asm.S libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/cache/cache.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/dev/clock-ppc-dec.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/bat.c @@ -68,8 +70,6 @@ libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/mmu/pte121.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/irq/ppc-irq-legacy.c libbsp_a_SOURCES += ../../../../../../bsps/powerpc/shared/ppc-dec-timer.c -libbsp_a_LIBADD = ../../../libcpu/@RTEMS_CPU@/mpc6xx/altivec.rel - include $(top_srcdir)/../../../../automake/local.am include $(srcdir)/../../../../../../bsps/powerpc/shared/shared.am include $(srcdir)/../../../../../../bsps/powerpc/shared/exceptions.am diff --git a/c/src/lib/libcpu/powerpc/Makefile.am b/c/src/lib/libcpu/powerpc/Makefile.am index 1c15f2cc59..9d19f0e13d 100644 --- a/c/src/lib/libcpu/powerpc/Makefile.am +++ b/c/src/lib/libcpu/powerpc/Makefile.am @@ -39,13 +39,4 @@ if ppc405 ## ppc4xx/include endif # ppc405 -if mpc6xx -# mpc6xx/altivec -noinst_PROGRAMS += mpc6xx/altivec.rel -mpc6xx_altivec_rel_SOURCES = mpc6xx/altivec/vec_sup.c mpc6xx/altivec/vec_sup_asm.S -mpc6xx_altivec_rel_CPPFLAGS = $(AM_CPPFLAGS) -mpc6xx_altivec_rel_LDFLAGS = $(RTEMS_RELLDFLAGS) -endif -EXTRA_DIST += mpc6xx/altivec/README - include $(top_srcdir)/../../../automake/local.am diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/README b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/README deleted file mode 100644 index 61ebb8dded..0000000000 --- a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/README +++ /dev/null @@ -1,184 +0,0 @@ -RTEMS ALTIVEC SUPPORT -===================== - -1. History ----------- - -Altivec support was developed and maintained as a user-extension -outside of RTEMS. This extension is still available (unbundled) -from Till Straumann ; it is useful -if an application desires 'lazy switching' of the altivec context. - -2. 
Modes --------- - -Altivec support -- the unbundled extension, that is -- can be used -in two ways: - -a. All tasks are implicitly AltiVec-enabled. - -b. Only designated tasks are AltiVec-enabled. 'Lazy-context switching' - is implemented to switch AltiVec the context. - -Note that the code implemented in this directory supports mode 'a' -and mode 'a' ONLY. For mode 'b' you need the unbundled extension -(which is completely independent of this code). - -Mode 'a' (All tasks are AltiVec-enabled) -- - - - - - - - - - - - - - - - - - - - - - -The major disadvantage of this mode is that additional overhead is -involved: tasks that never use the vector unit still save/restore -the volatile vector registers (20 registers * 16bytes each) across -every interrupt and all non-volatile registers (12 registers * 16b each) -during every context switch. - -However, saving/restoring e.g., the volatile registers is quite -fast -- on my 1GHz 7457 saving or restoring 20 vector registers -takes only about 1us or even less (if there are cache hits). - -The advantage is complete transparency to the user and full ABI -compatibility (exept for ISRs and exception handlers), see below. - -Mode 'b' (Only dedicated tasks are AltiVec-enabled) -- - - - - - - - - - - - - - - - - - - - - - - - - - - -The advantage of this mode of operation is that the vector-registers -are only saved/restored when a different, altivec-enabled task becomes -ready to run. In particular, if there is only a single altivec-enabled -task then the altivec-context *never* is switched. - -Note that this mode of operation is not supported by the code -in this directory -- you need the unbundled altivec extension -mentioned above. - -3. Compiler Options -------------------- - -Three compiler options affect AltiVec: -maltivec, -mabi=altivec and --mvrsave=yes/no. - --maltivec: This lets the cpp define the symbol __ALTIVEC__ and enables - gcc to emit vector instructions. Note that gcc may use the - AltiVec engine implicitly, i.e., **without you writing any - vectorized code**. - --mabi=altivec: This option has two effects: - i) It ensures 16-byte stack alignment required by AltiVec - (even in combination with eabi which is RTEMS' default). - ii) It allows vector arguments to be passed in vector registers. - --mvrsave=yes/no: Instructs gcc to emit code which sets the VRSAVE register - indicating which vector registers are 'currently in use'. - Because the altivec support does not use this information *) the - option has no direct affect but it is desirable to compile with - -mvrsave=no so that no unnecessary code is generated. - - *) The file vec_sup_asm.S conditionally disables usage of - the VRSAVE information if the preprocessor symbol - 'IGNORE_VRSAVE' is defined, which is the default. - - If 'IGNORE_VRSAVE' is undefined then the code *does* - use the VRSAVE information but I found that this does - not execute noticeably faster. - -IMPORTANT NOTES -=============== - -AFAIK, RTEMS uses the EABI which requires a stack alignment of only 8 bytes -which is NOT enough for AltiVec (which requires 16-byte alignment). - -There are two ways for obtaining 16-byte alignment: - -I) Compile with -mno-eabi (ordinary SYSV ABI has 16-byte alignment) -II) Compile with -mabi=altivec (extension to EABI; maintains 16-byte alignment - but also allows for passing vector arguments in vector registers) - -Note that it is crucial to compile ***absolutely everything*** with the same -ABI options (or a linker error may occur). 
In particular, this includes - - - newlibc multilib variant - - RTEMS proper - - application + third-party code - -IMO the proper compiler options for Mode 'a' would be - - -maltivec -mabi=altivec -mvrsave=no - -Note that the -mcpu=7400 option also enables -maltivec and -mabi=altivec -but leaves -mvrsave at some 'default' which is probably 'no'. -Compiling with -mvrsave=yes does not produce incompatible code but -may have a performance impact (since extra code is produced to maintain -VRSAVE). - -4. Multilib Variants --------------------- - -The default GCC configuration for RTEMS contains a -mcpu=7400 multilib -variant which is the correct one to choose. - -5. BSP 'custom' file. ---------------------- - -Now that you have the necessary newlib and libgcc etc. variants -you also need to build RTEMS accordingly. - -In you BSP's make/custom/.cfg file make sure the CPU_CFLAGS -select the desired variant: - -for mode 'a': - - CPU_CFLAGS = ... -mcpu=7400 - -Note that since -maltivec globally defines __ALTIVEC__ RTEMS automatially -enables code that takes care of switching the AltiVec context as necessary. -This is transparent to application code. - -6. BSP support --------------- - -It is the BSP's responsibility to initialize MSR_VE, VSCR and VRSAVE -during early boot, ideally before any C-code is executed (because it -may, theoretically, use vector instructions). - -The BSP must - - - set MSR_VE - - clear VRSAVE; note that the probing algorithm for detecting - whether -mvrsave=yes or 'no' was used relies on the BSP - clearing VRSAVE during early start. Since no interrupts or - context switches happen before the AltiVec support is initialized - clearing VRSAVE is no problem even if it turns out that -mvrsave=no - was in effect (eventually a value of all-ones will be stored - in VRSAVE in this case). - - clear VSCR - -7. PSIM note ------------- - -PSIM supports the AltiVec instruction set with the exception of -the 'data stream' instructions for cache prefetching. The RTEMS -altivec support includes run-time checks to skip these instruction -when executing on PSIM. - -Note that AltiVec support within PSIM must be enabled at 'configure' -time by passing the 'configure' option - ---enable-sim-float=altivec - -Note also that PSIM's AltiVec support has many bugs. It is recommended -to apply the patches filed as an attachment with gdb bug report #2461 -prior to building PSIM. - -The CPU type and corresponding multilib must be changed when -building RTEMS/psim: - - edit make/custom/psim.cfg and change - - CPU_CFLAGS = ... -mcpu=603e - - to - - CPU_CFLAGS = ... -mcpu=7400 - -This change must be performed *before* configuring RTEMS/psim. diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c deleted file mode 100644 index 141779c175..0000000000 --- a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup.c +++ /dev/null @@ -1,273 +0,0 @@ -/* Altivec support for RTEMS; vector register context management. */ - -/* - * Authorship - * ---------- - * This software was created by - * Till Straumann , 2009, - * Stanford Linear Accelerator Center, Stanford University. - * - * Acknowledgement of sponsorship - * ------------------------------ - * This software was produced by - * the Stanford Linear Accelerator Center, Stanford University, - * under Contract DE-AC03-76SFO0515 with the Department of Energy. 
- * - * Government disclaimer of liability - * ---------------------------------- - * Neither the United States nor the United States Department of Energy, - * nor any of their employees, makes any warranty, express or implied, or - * assumes any legal liability or responsibility for the accuracy, - * completeness, or usefulness of any data, apparatus, product, or process - * disclosed, or represents that its use would not infringe privately owned - * rights. - * - * Stanford disclaimer of liability - * -------------------------------- - * Stanford University makes no representations or warranties, express or - * implied, nor assumes any liability for the use of this software. - * - * Stanford disclaimer of copyright - * -------------------------------- - * Stanford University, owner of the copyright, hereby disclaims its - * copyright and all other rights in this software. Hence, anyone may - * freely use it for any purpose without restriction. - * - * Maintenance of notices - * ---------------------- - * In the interest of clarity regarding the origin and status of this - * SLAC software, this and all the preceding Stanford University notices - * are to remain affixed to any copy or derivative of this software made - * or distributed by the recipient and are to be affixed to any copy of - * software made or distributed by the recipient that contains a copy or - * derivative of this software. - * - * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03 - */ - -#ifdef __ALTIVEC__ - -#include -#include -#include -#include -#include -#include - -#define STATIC static - -#define VEC_ALIGNMENT 16 - -#define NAM "AltiVec Support" -#define ERRID(a,b,c,d) (((a)<<24) | ((b)<<16) | ((c)<<8) | (d)) - -typedef uint32_t _vu32 __attribute__((vector_size(VEC_ALIGNMENT))); - -#ifndef MSR_VE -#define MSR_VE (1<<(31-6)) -#endif - -/* NOTE: These two variables are accessed by assembly code - * which assumes 32-bit data! - */ -uint32_t _CPU_altivec_ctxt_off = 0; -uint32_t _CPU_altivec_psim_cpu = 0; - -static inline uint32_t -mfmsr(void) -{ -uint32_t v; - _CPU_MSR_GET(v); - return v; -} - -static inline void -mtmsr(uint32_t v) -{ - _CPU_MSR_SET(v); -} - -static inline void -isync(void) -{ - asm volatile("isync"); -} - -static inline void -dssall(void) -{ - if ( !_CPU_altivec_psim_cpu) - asm volatile("dssall"); -} - -static inline uint32_t -set_MSR_VE(void) -{ -uint32_t rval; - rval=mfmsr(); - if ( ! (MSR_VE & rval ) ) { - mtmsr(rval | MSR_VE); - isync(); - } - return rval; -} - -static inline void -clr_MSR_VE(void) -{ - dssall(); - mtmsr(mfmsr() & ~MSR_VE); - isync(); -} - -static inline void -rst_MSR_VE(uint32_t old) -{ - if ( ! ( MSR_VE & old ) ) { - dssall(); - mtmsr(old); - isync(); - } -} - - -/* Code to probe the compiler's stack alignment (PowerPC); - * The routine determines at run-time if the compiler generated - * 8 or 16-byte aligned code. - * - * Till Straumann , 2005 - */ - -static void dummy(void) __attribute__((noinline)); -/* add (empty) asm-statement to make sure this isn't optimized away */ -static void dummy(void) { __asm__ volatile(""); } - -static unsigned probe_r1(void) __attribute__((noinline)); -static unsigned probe_r1(void) -{ -unsigned r1; - /* call something to enforce creation of a minimal stack frame; - * (8 bytes: r1 and lr space for 'dummy' callee). If compiled - * with -meabi -mno-altivec gcc allocates 8 bytes, if -mno-eabi - * or -maltivec / -mabi=altivec then gcc allocates 16 bytes - * according to the sysv / altivec ABI specs. 
- */ - dummy(); - /* return stack pointer */ - asm volatile("mr %0,1":"=r"(r1)); - return r1; -} - -static unsigned -probe_ppc_stack_alignment(void) -{ -unsigned r1; - asm volatile("mr %0,1":"=r"(r1)); - return (r1 - probe_r1()) & ~ 0xf; -} - -STATIC int check_stack_alignment(void) -{ -int rval = 0; - if ( VEC_ALIGNMENT > PPC_STACK_ALIGNMENT ) { - printk(NAM": CPU support has unsufficient stack alignment;\n"); - printk("modify 'cpukit/score/cpu/powerpc/rtems/score/powerpc.h'\n"); - printk("and choose PPC_ABI_SVR4. I'll enable a workaround for now.\n"); - rval |= 1; - } - /* Run-time check; should compile with -mabi=altivec */ - if ( probe_ppc_stack_alignment() < VEC_ALIGNMENT ) { - printk(NAM": run-time stack alignment unsufficient; make sure you compile with -mabi=altivec\n"); - rval |= 2; - } - return rval; -} - - -static uint32_t probe_vrsave(_vu32 *p_v) __attribute__((noinline)); - -/* Check if this code was compiled with -mvrsave=yes or no - * so that we can set the default/init value accordingly. - */ -static uint32_t probe_vrsave(_vu32 *p_v) -{ -_vu32 x; -uint32_t vrsave; - /* Explicitly clobber a volatile vector reg (0) that is - * not used to pass return values. - * If -mvrsave=yes was used this should cause gcc to - * set bit 0 in vrsave. OTOH this bit cannot be set - * because v0 is volatile and not used to pass a value - * to the caller... - */ - asm volatile("vxor %0, 0, 0; mfvrsave %1":"=v"(x),"=r"(vrsave)::"v0"); - if ( p_v ) { - *p_v = x; - } - return vrsave; -} - -static int vrsave_yes(void) __attribute__((noinline)); - -static int vrsave_yes(void) -{ -uint32_t vrsave_pre; - asm volatile("mfvrsave %0":"=r"(vrsave_pre)); - if ( (vrsave_pre & 0x80000000) ) { - printk(NAM": WARNING - unable to determine whether -mvrsave was used; assuming NO\n"); - return 0; - } - return probe_vrsave(0) != vrsave_pre; -} - -extern void -_CPU_altivec_set_vrsave_initval(uint32_t); - - -void -_CPU_Initialize_altivec(void) -{ -unsigned pvr; - - /* I don't like to have to #define the offset of the altivec area - * for use by assembly code. - * Therefore, we compute it here and store it in memory... - */ - _CPU_altivec_ctxt_off = offsetof(ppc_context, altivec); - - /* - * See ppc_get_context() and PPC_CONTEXT_OFFSET_GPR1 - */ - _CPU_altivec_ctxt_off += PPC_DEFAULT_CACHE_LINE_SIZE; - - /* - * Add space possibly needed for alignment - */ - _CPU_altivec_ctxt_off += PPC_CACHE_ALIGNMENT - 1; - - if ( ! vrsave_yes() ) { - /* They seemed to compile with -mvrsave=no. Hence we - * must set VRSAVE so that all registers are saved/restored - * in case this support was not built with IGNORE_VRSAVE. - */ - _CPU_altivec_set_vrsave_initval( -1 ); - } - - if ( check_stack_alignment() & 2 ) - rtems_fatal_error_occurred(ERRID('V','E','C','1')); - - pvr = get_ppc_cpu_type(); - /* psim has altivec but lacks the streaming instructions :-( */ - _CPU_altivec_psim_cpu = (PPC_PSIM == pvr); - - if ( ! ppc_cpu_has_altivec() ) { - printk(NAM": This CPU seems not to have AltiVec\n"); - rtems_panic("Unable to initialize AltiVec Support\n"); - } - - if ( ! 
(mfmsr() & MSR_VE) ) { - printk(NAM": Warning: BSP should set MSR_VE early; doing it now...\n"); - set_MSR_VE(); - } -} -#endif diff --git a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S b/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S deleted file mode 100644 index 279d1704a7..0000000000 --- a/c/src/lib/libcpu/powerpc/mpc6xx/altivec/vec_sup_asm.S +++ /dev/null @@ -1,821 +0,0 @@ -#ifdef __ALTIVEC__ - -/* Altivec support for RTEMS; vector register context management. */ - -/* - * Authorship - * ---------- - * This software was created by - * Till Straumann , 2009, - * Stanford Linear Accelerator Center, Stanford University. - * - * Acknowledgement of sponsorship - * ------------------------------ - * This software was produced by - * the Stanford Linear Accelerator Center, Stanford University, - * under Contract DE-AC03-76SFO0515 with the Department of Energy. - * - * Government disclaimer of liability - * ---------------------------------- - * Neither the United States nor the United States Department of Energy, - * nor any of their employees, makes any warranty, express or implied, or - * assumes any legal liability or responsibility for the accuracy, - * completeness, or usefulness of any data, apparatus, product, or process - * disclosed, or represents that its use would not infringe privately owned - * rights. - * - * Stanford disclaimer of liability - * -------------------------------- - * Stanford University makes no representations or warranties, express or - * implied, nor assumes any liability for the use of this software. - * - * Stanford disclaimer of copyright - * -------------------------------- - * Stanford University, owner of the copyright, hereby disclaims its - * copyright and all other rights in this software. Hence, anyone may - * freely use it for any purpose without restriction. - * - * Maintenance of notices - * ---------------------- - * In the interest of clarity regarding the origin and status of this - * SLAC software, this and all the preceding Stanford University notices - * are to remain affixed to any copy or derivative of this software made - * or distributed by the recipient and are to be affixed to any copy of - * software made or distributed by the recipient that contains a copy or - * derivative of this software. - * - * ------------------ SLAC Software Notices, Set 4 OTT.002a, 2004 FEB 03 - */ - - -#include - -#ifndef PPC_CACHE_ALIGNMENT -#error "Missing header; PPC_CACHE_ALIGNMENT is not defined" -#endif - -#define ALTIVEC_TESTING - -#if PPC_CACHE_ALIGNMENT != 32 -#error "Altivec support assumes cache-line size is 32 bytes!" 
-#else -#undef LD_PPC_CACHE_ALIGNMENT -#define LD_PPC_CACHE_ALIGNMENT 5 -#endif - - .set v0, 0 - .set v8, 8 - .set v16, 16 - .set v20, 20 - .set v24, 24 - .set v28, 28 - - .set r0, 0 - .set r3, 3 - .set r4, 4 - /* Do not use r5, since this is used by _CPU_Context_switch() */ - .set r6, 6 - .set r7, 7 - .set r8, 8 - .set r9, 9 - .set r10, 10 - .set r11, 11 - /* Do not use r12, since this is used by _CPU_Context_switch() */ - - .set cr5, 5 - - .set VECSIZE, 16 - - .set VRSAVE_INIT_VAL, 0 - .set VSCR_INIT_VAL, 0 - - .set VRSAVE_OFF, 16 - .set VSCR_OFF, 16+12 - - .set ds0, 0 - - /* Block size for dst -- in units of 16-bytes */ - .set BSIZE, 2 /* = 32 bytes */ - .set BCNT, 12/2+1 /* 12 non-volatile registers + area for vscr/vrsave */ - .set BSTRIDE, 32 /* bytes */ - - .data - - .global _CPU_altivec_vrsave_initval -_CPU_altivec_vrsave_initval: - .long 0 - - .global _CPU_altivec_vscr_initval -_CPU_altivec_vscr_initval: - .long 0 - - .text - - .extern _CPU_altivec_psim_cpu - .extern _CPU_altivec_ctxt_off - - .macro CMPOFF _B0 - lis \_B0, _CPU_altivec_ctxt_off@ha - lwz \_B0, _CPU_altivec_ctxt_off@l(\_B0) - .endm - - /* Conditionally load or store a vector _VR to - * EA(_R1|0 + _R2) - * If bit _VR (corresponding to _VR) is set in CRC - * then the load/store is performed but otherwise - * it is skipped. - * If compiled with IGNORE_VRSAVE defined then - * the load/store is done unconditionally. - * - * _OPCODE: intended to be lvx, lvxl, stvx or stvxl - * _VR : target vector register - * _R1 : base register (NOTE: _R1=r0 uses a - * implicit ZERO constant, not the contents - * of r0) for address computation. - * _R2 : 'offset' register for address computation. - * - * MODIFIES: _VR on output if a load operation is performed. - * IMPLICIT USE: CRC (unless compiled with IGNORE_VRSAVE - * defined. - */ - .macro LDST _OPCODE, _VR, _R1, _R2 -#ifndef IGNORE_VRSAVE - bc 4, \_VR, 111f -#endif - \_OPCODE \_VR, \_R1, \_R2 -111: - .endm - - /* - * Load or store four 'adjacent' vector registers. - * - * _OPCODE: intended to be lvx, lvxl, stvx or stvxl - * _VR : target vector register - * _R1 : base register (NOTE: _R1=r0 uses a - * implicit ZERO constant, not the contents - * of r0) for address computation. - * _B0 : base register 0 - * _B1 : base register 1 - * _B2 : base register 2 - * _B3 : base register 3 - * _RO : offset register - * - * memory addresses for _VR, _VR+1, _VR+2, _VR+3 - * are _B0+_RO, _B1+_RO, _B2+_RO, _B3+_RO, respectively. - * - * MODIFIES: _VR, _VR+1, _VR+2, _VR+3 if a load - * operation is performed. - * IMPLICIT USE: see LDST - */ - .macro LDST4 _OPCODE, _VR, _B0, _B1, _B2, _B3, _RO - LDST _OPCODE=\_OPCODE _VR=\_VR+0 _R1=\_B0 _R2=\_RO - LDST _OPCODE=\_OPCODE _VR=\_VR+1 _R1=\_B1 _R2=\_RO - LDST _OPCODE=\_OPCODE _VR=\_VR+2 _R1=\_B2 _R2=\_RO - LDST _OPCODE=\_OPCODE _VR=\_VR+3 _R1=\_B3 _R2=\_RO - .endm - - /* - * Preload/zero two cache lines and save 4 vector registers - * to memory. - * Note that the cache operation targets memory *past* the - * current storage area which should hopefully hit when - * This same code is executed on the next two cache lines... - * - * This code effectively does - * dcbz (_B0 + 64) - * dcbz (_B0 + 64 + 32) - * stvx _VF+0, (_B0+ 0) - * stvx _VF+1, (_B0+16) - * stvx _VF+2, (_B0+32) - * stvx _VF+3, (_B0+48) - * - * _LRU: may be 'l' or empty. The former variant should be - * used when it is conceivable that the memory area is - * unlikely to be used in the near future thus making - * it a candidate for early eviction from the caches. 
- * - * If it is likely that the memory area is reused soon - * (e.g., save/restore across ISR execution) then the - * 'stvx' opcode (w/o 'l' suffix) should be used. - * - * _VR: first of four target vector registers; _VR+0, - * _VR+1, _VR+2, _VR+3 are saved. - * - * _BO: base address of memory area. - * _B1: should contain _B0+16 on entry - * _B2: should contain _B0+32 on entry - * _B3: should contain _B0+48 on entry - * - * _O1: contains the offset where the four vectors are - * stored. - * _VR -> (_B0 + _O1) = (_B0 + _O1 + 0 ) - * _VR+1-> (_B1 + _O1) = (_B0 + _O1 + 16 ) - * _VR+2-> (_B2 + _O1) = (_B0 + _O1 + 32 ) - * _VR+3-> (_B3 + _O1) = (_B0 + _O1 + 48 ) - * _O2: is set to _O1 + 64 by this macro. Hence _O2 is - * used to address the two cache-lines past the - * current memory area. - * - * MODIFIES: _O2; contains _O1 + 64 after execution of this - * code. - * - * NOTES: a different set of four vectors can be addressed - * simply by changing the one offset register _O1. - * - * Saving more than 4 registers can simply be - * achieved by expanding this macro multiple - * times with _O1 and _O2 swapped (new _O1 - * becomes _O2 = old _O1 + 64) thus stepping - * through the memory area. - * - */ - .macro S4VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 - addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT - dcbz \_B0, \_O2 - dcbz \_B2, \_O2 - LDST4 _OPCODE=stvx\_LRU _VR=\_VR _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 - .endm - - /* - * Save eight vector registers by expanding S4VEC_P twice. - * See notes for S4VEC_P above. - * - * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) - * - * MODIFIES: After execution, - * _O2 contains original _O1 + 64, - * _O1 contains original _O1 + 128 - * - * NOTES: Expanding this macro multiple times lets you save - * multiple blocks of 8 registers (no reload of _Bx / _Ox is needed). - */ - .macro S8VEC_P _LRU, _VR, _B0, _B1, _B2, _B3, _O1, _O2 - S4VEC_P \_LRU _VR=\_VR+0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 - /* Note that the roles of _O1 and _O2 are swapped here */ - S4VEC_P \_LRU _VR=\_VR+4 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O2 _O2=\_O1 - .endm - - /* - * Save volatile vector registers v0..v19 to memory area starting at (_B0 + _O1) - * - * See notes above (for S4VEC_P). - * - * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) - * MODIFIES: _O1 contains original _O1 + 256 - * _O2 contains original _O1 + 256 - 64 - */ - .macro S_V0TOV19 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 - S8VEC_P \_LRU _VR=v0 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 - S8VEC_P \_LRU _VR=v8 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 - LDST4 stvx\_LRU _VR=v16 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _RO=\_O1 - .endm - - /* - * Save non-volatile vector registers v20..v31 to memory area starting at (_B0 + _O1) - * - * See notes above (for S4VEC_P, S_V0TOV19). 
- * - * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) - * MODIFIES: _O1 contains original _O1 + 128 - * _O2 contains original _O1 + 128 - 64 - */ - .macro S_V20TOV31 _LRU, _B0, _B1, _B2, _B3, _O1, _O2 - S8VEC_P \_LRU _VR=v20 _B0=\_B0 _B1=\_B1 _B2=\_B2 _B3=\_B3 _O1=\_O1 _O2=\_O2 - LDST4 stvx\_LRU v28 \_B0 \_B1 \_B2 \_B3 \_O1 - .endm - - /* - * Save all registers to memory area - * - * INPUTS: _B0, _B1, _B2, _B3, _B3, _O1 must be preloaded (see above) - * MODIFIES: _O1 contains original _O1 + 512 - * _O2 contains original _O1 + 512 - 64 - */ - .macro S_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 - S8VEC_P l v0 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 - S8VEC_P l v8 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 - S8VEC_P l v16 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 - S4VEC_P l v24 \_B0 \_B1 \_B2 \_B3 \_O1 \_O2 - LDST4 stvxl v28 \_B0 \_B1 \_B2 \_B3 \_O2 - .endm - - - /* - * Macros that expand to 'dcbt _RA, _RB' or nothing, respectively. - * We can pass either of them as arguments to another macro which - * allows us to decide if the main macro uses dcbt or not when - * we expand it... - */ - .macro DO_DCBT _RA, _RB - dcbt \_RA, \_RB - .endm - - .macro NO_DCBT _RA, _RB - .endm - - /* - * NOTE REGARDING dcbt VS dst - * - * Preloading the cache with memory areas that we soon need - * can be done either using 'dcbt' or 'dst' instructions - * "ahead of time". - * When experimenting (on a mpc7457) I found that the 'dst' - * stream instruction was very efficient if there is enough - * time to read ahead. It works well when we do a context - * switch: - * - * 1) start DST on new context to be loaded - * 2) save old context to memory - * 3) load new context from memory - * - * Because of the interleaved step 2) dst works nicely and - * 3) finds what it needs in the cache. - * - * However, in a situation when there is not much time - * to start the DST, e.g., because we want to restore - * a context out of the blue (e.g., after returning - * from and ISR): - * - * 1) save volatile registers to memory/stack - * 2) execute ISR - * 3) might do a task context switch - * 4) when returned to old task context then - * reload volatile registers from memory/stack. - * - * In this situation, preloading the target memory before - * or after step 1) makes obviously no sense because after - * 1) the registers area is most likely in the cache already. - * - * Starting preload after 2) doesn't make much sense either. - * If ISR doesn't lead to a context switch then it is quite - * likely that the register area is still in the cache. - * OTOTH, if a context switch happens then the preload after 2) - * might be useless. - * - * This leaves us at step 4) where we want to load immediately. - * In this case, I found that 'dcbt' works more efficiently - * so that's what we use when restoring volatile registers. - * - * When restoring the non-volatile VRs during a 'normal' - * context switch then we shall use DST (and no dcbt). - */ - - /* - * Symmetric to S4VEC_P above but addresses loading four - * vector registers from memory. - * - * Touches two cache lines past the current memory area - * and loads four vectors from the current area. - * - * Optionally, the DCBT operation may be omitted - * (when expanding with _DCBT=NO_DCBT). - * This is useful if the cache was already preloaded - * by another means (dst instruction). - * - * NOTE: We always use the 'LRU' form of lvx: lvxl, - * because we deem it unlikely that the context - * that was just loaded has to be saved again - * to memory in the immediate future. 
- * - * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded - * as explained above. - * - * MODIFIES: _O2 contains original _O1 + 64. - * _VR.._VR+3 loaded from memory. - */ - .macro L4VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 - addi \_O2, \_O1, 2*PPC_CACHE_ALIGNMENT - /* preload/touch 2 lines at offset 64 from _B0 */ - \_DCBT \_B0, \_O2 - \_DCBT \_B2, \_O2 - /* load four vectors at off set 0 from _B0 */ - LDST4 lvxl, \_VR, \_B0, \_B1, \_B2, \_B3, \_O1 - .endm - - /* - * Symmetric to S8VEC_P; loads 8 vector registers - * from memory -- see comments above... - * - * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded - * as explained above. - * - * MODIFIES: _O1 contains original _O1 + 128. - * _O2 contains original _O1 + 64. - * _VR.._VR+7 loaded from memory. - */ - .macro L8VEC_A _DCBT, _VR, _B0, _B1, _B2, _B3, _O1, _O2 - L4VEC_A \_DCBT, \_VR+0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L4VEC_A \_DCBT, \_VR+4, \_B0, \_B1, \_B2, \_B3, \_O2, \_O1 - .endm - - /* - * Load volatile vector registers v0..v19 employing - * the DCBT to preload the cache. The rationale for - * using DCBT here but not when restoring non-volatile - * registers is explained above, see - * - * "NOTE REGARDING dcbt VS dst" - * - * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded - * as explained above. - * - * MODIFIES: _O1 contains original _O1 + 256. - * _O2 contains original _O1 + 256 - 64. - * VR0..VR19 loaded from memory. - */ - .macro L_V0TOV19 _B0, _B1, _B2, _B3, _O1, _O2 - L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - LDST4 lvxl, v16, \_B0, \_B1, \_B2, \_B3, \_O1 - .endm - - /* - * Load non-volatile vector registers v20..v31. - * Note that no DCBT is performed since we use - * DST for preloading the cache during a context - * switch, see - * - * "NOTE REGARDING dcbt VS dst" - * - * INPUTS: _B0, _B1, _B2, _B3, _O1 must be loaded - * as explained above. - * - * MODIFIES: _O1 contains original _O1 + 128. - * _O2 contains original _O1 + 128 - 64. - * VR20..VR31 loaded from memory. - */ - .macro L_V20TOV31 _B0, _B1, _B2, _B3, _O1, _O2 - L8VEC_A NO_DCBT, v20, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O1 - .endm - - /* - * Load all registers from memory area. - */ - .macro L_V0TOV31 _B0, _B1, _B2, _B3, _O1, _O2 - L8VEC_A DO_DCBT, v0, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L8VEC_A DO_DCBT, v8, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L8VEC_A DO_DCBT, v16, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - L4VEC_A DO_DCBT, v24, \_B0, \_B1, \_B2, \_B3, \_O1, \_O2 - LDST4 lvxl, v28, \_B0, \_B1, \_B2, \_B3, \_O2 - .endm - - /* - * Compute - * _B1 = _B0 + 16 - * _B2 = _B0 + 32 - * _B3 = _B0 + 48 - * and load - * _RO = 0 - * - * convenience macro to be expanded before - * any of the load/store macros that use - * four base addresses etc. - * - * INPUT: _B0 = cache-aligned start of memory area - * - * MODIFIES: _B1, _B2, _B3, _RO as described above. - */ - .macro CMP_BASES _B0, _B1, _B2, _B3, _RO - addi \_B1, \_B0, 1*VECSIZE - addi \_B2, \_B0, 2*VECSIZE - addi \_B3, \_B0, 3*VECSIZE - li \_RO, 0 - .endm - - /* - * Prepare for saving general vector registers. - * - * If not built with #define IGNORE_VRSAVE then - * - * 1) copy vrsave to CRC - * - * endif - * - * 2) copy vrsave to _VRSAVE_REG - * 3) preload/zero cache line where vrsave and vscr are stored. - * 4) compute base adresses from _B0 - * 5) preload/zero first two cache lines (remember that the - * first S8VEC_P starts preloading/zeroing at offset 64). 
- * - * INPUT: 'vrsave' register, _B0 (base address of memory area) - * MODIFIES: _VRSAVE_REG (holds contents of 'vrsave') - * _B0 = original _BO + 32 - * _B1 = original _B0 + 32 + 16, - * _B2 = original _B0 + 32 + 32, - * _B3 = original _B0 + 32 + 48, - * CRC = 'vrsave' (ONLY IF COMPILED with IGNORE_VRSAVE undefined) - */ - .macro PREP_FOR_SAVE _VRSAVE_REG, _B0, _B1, _B2, _B3, _RO - mfvrsave \_VRSAVE_REG -#ifndef IGNORE_VRSAVE - mtcr \_VRSAVE_REG -#endif - dcbz 0, \_B0 - addi \_B0, \_B0, PPC_CACHE_ALIGNMENT - dcbz 0, \_B0 - CMP_BASES \_B0, \_B1, \_B2, \_B3, \_RO - dcbz 0, \_B2 - .endm - - /* - * Store _VRSAVE_REG and _VSCR_VREG to memory. These registers - * must have been loaded from 'vrsave' and 'vscr', respectively, - * prior to expanding this macro. - * - * INPUTS: _VRSAVE_REG GPR holding 'vrsave' contents - * _VSCR_VREG VR holding 'vscr' contents - * _B0 cache-aligned (base) address of memory area. - * MODIFIES: _SCRATCH_REG - */ - .macro S_VSCR_VRSAVE _VRSAVE_REG, _VSCR_VREG, _B0, _SCRATCH_REG - stw \_VRSAVE_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) - li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF - stvewx \_VSCR_VREG, \_B0, \_SCRATCH_REG - .endm - - /* - * Load 'vrsave' and 'vscr' from memory. - * - * INPUTS: _B0 cache-aligned (base) address of memory area. - * MODIFIES: _SCRATCH_REG (gpr), _SCRATCH_VREG (vr) - * 'vscr', 'vrsave'. - * CRC (holds contents of 'vrsave') (ONLY IF COMPILED - * with IGNORE_VRSAVE undefined). - */ - .macro L_VSCR_VRSAVE _B0, _SCRATCH_REG, _SCRATCH_VREG - lwz \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VRSAVE_OFF(\_B0) - mtvrsave \_SCRATCH_REG -#ifndef IGNORE_VRSAVE - mtcr \_SCRATCH_REG -#endif - li \_SCRATCH_REG, - PPC_CACHE_ALIGNMENT + VSCR_OFF - lvewx \_SCRATCH_VREG, \_B0, \_SCRATCH_REG - mtvscr \_SCRATCH_VREG - .endm - - /* - * _B0 &= ~ (PPC_CACHE_ALIGNMENT - 1) - * - * INPUT: _B0 - * MODIFIES: _B0 (as stated above) - */ - .macro CACHE_DOWNALGN _B0 - rlwinm \_B0, \_B0, 0, 0, 31-LD_PPC_CACHE_ALIGNMENT - .endm - - .text - - .global _CPU_save_altivec_volatile -_CPU_save_altivec_volatile: - /* Align address up to next cache-line boundary */ - addi r3, r3, PPC_CACHE_ALIGNMENT - 1 - CACHE_DOWNALGN r3 - -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. - */ - mfcr r9 -#endif - - PREP_FOR_SAVE r0, r3, r4, r8, r6, r10 - /* r0 now contains VRSAVE, r3 still the aligned memory area - * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3, - * respectively. r10 holds zero - */ - S_V0TOV19 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11 - mfvscr v0 - /* Store vrsave (still in r0) and vscr (in v0) to memory area */ - S_VSCR_VRSAVE r0, v0, r3, r11 - -#ifndef IGNORE_VRSAVE - /* Restore CRC */ - mtcr r9 -#endif - blr - - .global _CPU_load_altivec_volatile -_CPU_load_altivec_volatile: - /* Align address up to next cache-line boundary */ - addi r3, r3, PPC_CACHE_ALIGNMENT - 1 - CACHE_DOWNALGN r3 -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. 
- */ - mfcr r9 -#endif - - /* Try to preload 1st line (where vscr and vrsave are stored) */ - dcbt 0, r3 - /* Point to start of general vector-register area */ - addi r3, r3, PPC_CACHE_ALIGNMENT - /* Start preloading 2nd line (where first two vectors are) */ - dcbt 0, r3 - L_VSCR_VRSAVE r3, r0, v0 - CMP_BASES r3, r4, r8, r6, r10 - /* Start preloading 3rd line (where vectors 3 and 4 are) */ - dcbt 0, r8 - L_V0TOV19 r3, r4, r8, r6, r10, r11 - -#ifndef IGNORE_VRSAVE - mtcr r9 -#endif - blr - - .global _CPU_Context_switch_altivec -_CPU_Context_switch_altivec: - - /* fetch offset of altivec area in context */ - CMPOFF r8 - /* down-align 'to' area to cache-line boundary */ - add r4, r4, r8 - CACHE_DOWNALGN r4 - - /* Check for PSIM */ - lis r6, _CPU_altivec_psim_cpu@ha - lwz r6, _CPU_altivec_psim_cpu@l(r6) - cmpli 0, r6, 0 - bne 1f - /* Skip data-stream instructions on PSIM (not implemented) */ - dssall - /* Pre-load new context into cache */ - lis r6, (BSIZE<<(24-16)) | (BCNT<<(16-16)) - ori r6, r6, BSTRIDE - dstt r4, r6, ds0 -1: - -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. - */ - mfcr r9 -#endif - - /* Is 'from' context == NULL ? (then we just do a 'restore') */ - cmpli 0, r3, 0 - beq 1f /* yes: skip saving 'from' context */ - - /* SAVE NON-VOLATILE REGISTERS */ - - /* Compute aligned destination pointer (r8 still holds offset - * to 'altivec' area in context) - */ - add r3, r3, r8 - CACHE_DOWNALGN r3 - - PREP_FOR_SAVE r0, r3, r8, r6, r7, r10 - /* The manual says reading vscr can take some time - do - * read it here (into a volatile vector register) while - * we wait for cache blocks to be allocated - */ - mfvscr v0 - S_V20TOV31 _LRU=l, _B0=r3, _B1=r8, _B2=r6, _B3=r7, _O1=r10, _O2=r11 - /* vrsave is now in r0 (PREP_FOR_SAVE), vscr in v0 */ - S_VSCR_VRSAVE r0, v0, r3, r8 - -1: - - /* LOAD NON-VOLATILE REGISTERS */ - - /* Advance past vrsave/vscr area */ - addi r4, r4, PPC_CACHE_ALIGNMENT - L_VSCR_VRSAVE r4, r0, v0 - CMP_BASES r4, r8, r6, r7, r10 - L_V20TOV31 r4, r8, r6, r7, r10, r11 - -#ifndef IGNORE_VRSAVE - mtcr r9 -#endif - blr - - .global _CPU_Context_initialize_altivec -_CPU_Context_initialize_altivec: - CMPOFF r8 - add r3, r3, r8 - CACHE_DOWNALGN r3 - lis r8, _CPU_altivec_vrsave_initval@ha - lwz r8, _CPU_altivec_vrsave_initval@l(r8) - stw r8, VRSAVE_OFF(r3) - lis r6, _CPU_altivec_vscr_initval@ha - lwz r6, _CPU_altivec_vscr_initval@l(r6) - stw r6, VSCR_OFF(r3) - blr - - /* - * Change the initial value of VRSAVE. - * Can be used by initialization code if - * it is determined that code was compiled - * with -mvrsave=no. In this case, VRSAVE - * must be set to all-ones which causes this - * support code to save/restore *all* registers - * (only has an effect if IGNORE_VRSAVE is - * not defined -- otherwise all registers are - * saved/restored anyways). - */ - .global _CPU_altivec_set_vrsave_initval -_CPU_altivec_set_vrsave_initval: - lis r8, _CPU_altivec_vrsave_initval@ha - stw r3, _CPU_altivec_vrsave_initval@l(r8) - mtvrsave r3 - blr - -#ifdef ALTIVEC_TESTING - .global msr_VE_on -msr_VE_on: - mfmsr r3 - oris r3, r3, 1<<(31-6-16) - mtmsr r3 - blr - - .global msr_VE_off -msr_VE_off: - mfmsr r3 - lis r4, 1<<(31-6-16) - andc r3, r3, r4 - mtmsr r3 - blr - - - .global mfvrsave -mfvrsave: - mfvrsave r3 - blr - - .global mtvrsave -mtvrsave: - mtvrsave r3 - blr - - /* Load all vector registers from memory area. 
- * NOTE: This routine is not strictly ABI compliant -- - * it guarantees that volatile vector registers - * have certain values on exit! - */ - .global _CPU_altivec_load_all -_CPU_altivec_load_all: - /* Align address up to next cache-line boundary */ - addi r3, r3, PPC_CACHE_ALIGNMENT - 1 - CACHE_DOWNALGN r3 -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. - */ - mfcr r9 -#endif - - /* Try to preload 1st line (where vscr and vrsave are stored) */ - dcbt 0, r3 - /* Point to start of general vector-register area */ - addi r3, r3, PPC_CACHE_ALIGNMENT - /* Start preloading 2nd line (where first two vectors are) */ - dcbt 0, r3 - L_VSCR_VRSAVE r3, r0, v0 - CMP_BASES r3, r4, r8, r6, r10 - /* Start preloading 3rd line (where vectors 3 and 4 are) */ - dcbt 0, r8 - L_V0TOV31 r3, r4, r8, r6, r10, r11 - -#ifndef IGNORE_VRSAVE - mtcr r9 -#endif - blr - - .global _CPU_altivec_save_all -_CPU_altivec_save_all: - /* Align address up to next cache-line boundary */ - addi r3, r3, PPC_CACHE_ALIGNMENT - 1 - CACHE_DOWNALGN r3 - -#ifndef IGNORE_VRSAVE - /* Save CRC -- it is used implicitly by all the LOAD/STORE macros - * when testing if we really should do the load/store operation. - */ - mfcr r9 -#endif - - PREP_FOR_SAVE r0, r3, r4, r8, r6, r10 - /* r0 now contains VRSAVE, r3 still the aligned memory area - * and r4, r8, r6 are offset by 16, 32, and 48 bytes from r3, - * respectively. r10 holds zero - */ - S_V0TOV31 _B0=r3, _B1=r4, _B2=r8, _B3=r6, _O1=r10, _O2=r11 - mfvscr v0 - /* Store vrsave (still in r0) and vscr (in v0) to memory area */ - S_VSCR_VRSAVE r0, v0, r3, r11 - -#ifndef IGNORE_VRSAVE - /* Restore CRC */ - mtcr r9 -#endif - blr - - -#if 0 - .gnu_attribute 4,1 - .gnu_attribute 8,1 -#endif - -#endif -#endif -- cgit v1.2.3
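
For illustration only (this example is not part of the patch): the README above
requires the BSP to set MSR_VE and to clear VRSAVE and VSCR before any AltiVec
code runs. A minimal sketch of how a BSP could do this from early C startup
code follows; the function name bsp_early_altivec_init() and the EARLY_MSR_VE
macro are invented for this example, and a real BSP may well prefer to do the
same work in its assembly start code before the first C function executes.

  /* Hypothetical early-boot hook; names are illustrative only. */
  #define EARLY_MSR_VE (1u << (31 - 6))   /* same bit as MSR_VE in vec_sup.c */

  void bsp_early_altivec_init(void)
  {
    unsigned long msr;

    /* Enable the vector unit first; VSCR is only accessible through it. */
    __asm__ volatile ("mfmsr %0" : "=r" (msr));
    msr |= EARLY_MSR_VE;
    __asm__ volatile ("mtmsr %0; isync" : : "r" (msr) : "memory");

    /* Clear VRSAVE so the -mvrsave probing in vec_sup.c starts from zero. */
    __asm__ volatile ("mtvrsave %0" : : "r" (0));

    /* Clear VSCR; the value has to pass through a scratch vector register. */
    __asm__ volatile ("vxor 0, 0, 0; mtvscr 0" : : : "v0");
  }

This sketch must be compiled with AltiVec enabled (e.g. -mcpu=7400) so that the
vxor/mtvscr instructions assemble, and it should run before
_CPU_Initialize_altivec() is called.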