From 87836607418053b231edf7537a471596212532b6 Mon Sep 17 00:00:00 2001 From: Sebastian Huber Date: Tue, 28 Mar 2017 13:15:04 +0200 Subject: benchmarks/linpack: Import Import linpack sources from: http://www.netlib.org/benchmark/linpack-pc.c Update #2958. --- testsuites/benchmarks/linpack/linpack-pc.c | 1333 ++++++++++++++++++++++++++++ 1 file changed, 1333 insertions(+) create mode 100644 testsuites/benchmarks/linpack/linpack-pc.c diff --git a/testsuites/benchmarks/linpack/linpack-pc.c b/testsuites/benchmarks/linpack/linpack-pc.c new file mode 100644 index 0000000000..a5e68696f1 --- /dev/null +++ b/testsuites/benchmarks/linpack/linpack-pc.c @@ -0,0 +1,1333 @@ +/* + * Linpack 100x100 Benchmark In C/C++ For PCs + * + ******************************************************************** + * + * Original Source from NETLIB + * + * Translated to C by Bonnie Toy 5/88 (modified on 2/25/94 to fix + * a problem with daxpy for unequal increments or equal increments + * not equal to 1. Jack Dongarra) + * + * To obtain rolled source BLAS, add -DROLL to the command lines. + * To obtain unrolled source BLAS, add -DUNROLL to the command lines. + * + * You must specify one of -DSP or -DDP to compile correctly. + * + * You must specify one of -DROLL or -DUNROLL to compile correctly. + * + ******************************************************************** + * + * Changes in this version + * + * 1. Function prototypes are declared and function headers have + * embedded parameter types to produce code for C and C++ + * + * 2. Arrays aa and a are declared as [200*200] and [200*201] to + * allow compilation with prototypes. + * + * 3. Function second changed (compiler dependent). + * + * 4. Timing method changed due to inaccuracy of PC clock (see below). + * + * 5. Additional date function included (compiler dependent). + * + * 6. Additional code used as a standard for a series of benchmarks:- + * Automatic run time calibration rather than fixed parameters + * Initial calibration with display to show linearity + * Results displayed at reasonable rate for viewing (5 seconds) + * Facilities for typing in details of system used etc. + * Compiler details in code in case .exe files used elsewhere + * Results appended to a text file (Linpack.txt) + * + * Roy Longbottom 101323.2241@compuserve.com 14 September 1996 + * + ************************************************************************ + * + * Timing + * + * The PC timer is updated at about 18 times per second or resolution of + * 0.05 to 0.06 seconds which is similar to the time taken by the main + * time consuming function dgefa on a 100 MHz Pentium. Thus there is no + * point in running the dgefa/dges1 combination three times as in the + * original version. Main timing for the latter, in the loop run NTIMES, + * executes matgen/dgefa, summing the time taken by matgen within the + * loop for later deduction from the total time. On a modern PC this sum + * can be based on a random selection of 0 or 0.05/0.06. This version + * executes the single pass once and the main timing loop five times, + * calculating the matgen overhead separately. + * + ************************************************************************* + * + * Example of Output + * + * Rolled Double Precision Linpack Benchmark - PC Version in 'C/C++' + * + * Compiler Watcom C/C++ 10.5 Win 386 + * Optimisation -zp4 -otexan -fp5 -5r -dDP -dROLL + * + * + * norm resid resid machep x[0]-1 x[n-1]-1 + * 0.4 7.41628980e-014 1.00000000e-015 -1.49880108e-014 -1.89848137e-014 + * + * + * Times are reported for matrices of order 100 + * 1 pass times for array with leading dimension of 201 + * + * dgefa dgesl total Mflops unit ratio + * 0.06000 0.00000 0.06000 11.44 0.1748 1.0714 + * + * + * Calculating matgen overhead + * + * 10 times 0.11 seconds + * 20 times 0.22 seconds + * 40 times 0.44 seconds + * 80 times 0.87 seconds + * 160 times 1.76 seconds + * 320 times 3.52 seconds + * 640 times 7.03 seconds + * + * Overhead for 1 matgen 0.01098 seconds + * + * + * Calculating matgen/dgefa passes for 5 seconds + * + * 10 times 0.71 seconds + * 20 times 1.38 seconds + * 40 times 2.80 seconds + * 80 times 5.66 seconds + * + * Passes used 70 + * + * This is followed by output of the normal data for dgefa, dges1, + * total, Mflops, unit and ratio with five sets of results for each. + * + ************************************************************************ + * + * Example from output file Linpack.txt + * + * LINPACK BENCHMARK FOR PCs 'C/C++' n @ 100 + * + * Month run 9/1996 + * PC model Escom + * CPU Pentium + * Clock MHz 100 + * Cache 256K + * Options Neptune chipset + * OS/DOS Windows 95 + * Compiler Watcom C/C++ 10.5 Win 386 + * OptLevel -zp4 -otexan -fp5 -5r -dDP -dROLL + * Run by Roy Longbottom + * From UK + * Mail 101323.2241@compuserve.com + * + * Rolling Rolled + * Precision Double + * norm. resid 0.4 + * resid 7.41628980e-014 + * machep 1.00000000e-015 (8.88178420e-016 NON OPT) + * x[0]-1 -1.49880108e-014 + * x[n-1]-1 -1.89848137e-014 + * matgen 1 seconds 0.01051 + * matgen 2 seconds 0.01050 + * Repetitions 70 + * Leading dimension 201 + * dgefa dgesl total Mflops + * 1 pass seconds 0.06000 0.00000 0.06000 + * Repeat seconds 0.06092 0.00157 0.06249 10.99 + * Repeat seconds 0.06077 0.00157 0.06234 11.01 + * Repeat seconds 0.06092 0.00157 0.06249 10.99 + * Repeat seconds 0.06092 0.00157 0.06249 10.99 + * Repeat seconds 0.06092 0.00157 0.06249 10.99 + * Average 10.99 + * Leading dimension 200 + * Repeat seconds 0.05936 0.00157 0.06093 11.27 + * Repeat seconds 0.05936 0.00157 0.06093 11.27 + * Repeat seconds 0.05864 0.00157 0.06021 11.40 + * Repeat seconds 0.05936 0.00157 0.06093 11.27 + * Repeat seconds 0.05864 0.00157 0.06021 11.40 + * Average 11.32 + * + ************************************************************************ + * + * Examples of Results + * + * Precompiled codes were produced via a Watcom C/C++ 10.5 compiler. + * Versions are available for DOS, Windows 3/95 and NT/Win 95. Both + * non-optimised and optimised programs are available. The latter has + * options as in the above example. Although these options can place + * functions in-line, in this case, daxpy is not in-lined. Optimisation + * reduces 18 instructions in the loop in this function to the following: + * + * L85 fld st(0) + * fmul qword ptr [edx] + * add eax,00000008H + * add edx,00000008H + * fadd qword ptr -8H[eax] + * inc ebx + * fstp qword ptr -8H[eax] + * cmp ebx,esi + * jl L85 + * + * Results produced are not consistent between runs but produce similar + * speeds when executing at a particular dimension (see above). An example + * of other results is 11.4/10.5 Mflops. Most typical double precision + * rolled results are: + * + * Opt No Opt Version/ + * MHz Cache Mflops Mflops Make/Options Via + * + * AM80386DX 40 128K 0.53 0.36 Clone Win/W95 + * 80486DX2 66 128K 2.5 1.9 Escom SIS chipset Win/W95 + * 80486DX2 66 128K 2.3 1.9 Escom SIS chipset NT/W95 + * 80486DX2 66 128K 2.8 2.0 Escom SIS chipset Dos/Dos + * Pentium 100 256K 11 4.2 Escom Neptune chipset Win/W95 + * Pentium 100 256K 11 5.5 Escom Neptune chipset NT/W95 + * Pentium 100 256K 12 4.4 Escom Neptune chipset Dos/Dos + * Pentium Pro 200 256K 48 19 Dell XPS Pro200n NT/NT + * + * The results are as produced when compiled as Linpack.cpp. Compiling as + * Linpack.c gives similar speeds but the code is a little different. + * + *************************************************************************** +*/ + + +#ifdef SP +#define REAL float +#define ZERO 0.0 +#define ONE 1.0 +#define PREC "Single " +#endif + +#ifdef DP +#define REAL double +#define ZERO 0.0e0 +#define ONE 1.0e0 +#define PREC "Double " +#endif + +#ifdef ROLL +#define ROLLING "Rolled " +#endif +#ifdef UNROLL +#define ROLLING "Unrolled " +#endif + + +#define NTIMES 10 + +#include +#include +#include +#include + + +static REAL atime[9][15]; +static char this_month; +static int this_year; + +void print_time (int row); +void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma); +void dgefa (REAL a[], int lda, int n, int ipvt[], int *info); +void dgesl (REAL a[],int lda,int n,int ipvt[],REAL b[],int job); +void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]); +void daxpy (int n, REAL da, REAL dx[], int incx, REAL dy[], int incy); +REAL epslon (REAL x); +int idamax (int n, REAL dx[], int incx); +void dscal (int n, REAL da, REAL dx[], int incx); +REAL ddot (int n, REAL dx[], int incx, REAL dy[], int incy); + +/* TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME TIME */ + #include /* for following time functions only */ + REAL second() + { + REAL secs; + clock_t Time; + Time = clock(); + secs = (REAL)Time / (REAL)CLOCKS_PER_SEC; + return secs ; + } + +/* DATE DATE DATE DATE DATE DATE DATE DATE DATE DATE DATE DATE DATE */ + #include /* for following date functions only */ + void what_date() + { + /* Watcom */ + struct dosdate_t adate; + _dos_getdate( &adate ); + this_month = adate.month; + this_year = adate.year; + + /* Borland + struct date adate; + getdate( &adate ); + this_month = adate.da_mon; + this_year = adate.da_year; + */ + return; + } + + +main () +{ + static REAL aa[200*200],a[200*201],b[200],x[200]; + REAL cray,ops,total,norma,normx; + REAL resid,residn,eps,t1,tm2,epsn,x1,x2; + REAL mflops; + static int ipvt[200],n,i,j,ntimes,info,lda,ldaa; + int Endit, pass, loop; + REAL overhead1, overhead2, time1, time2; + FILE *outfile; + char *compiler, *options, general[9][80] = {" "}; + + outfile = fopen("Linpack.txt","a+"); + if (outfile == NULL) + { + printf ("Cannot open results file \n\n"); + printf("Press any key\n"); + Endit = getch(); + exit (0); + } + +/************************************************************************ + * Enter details of compiler and options used * + ************************************************************************/ + /*----------------- --------- --------- ---------*/ + compiler = "INSERT COMPILER NAME HERE"; + options = "INSERT OPTIMISATION OPTIONS HERE"; + /* Include -dDP or -dSP and -dROLL or -dUNROLL */ + + lda = 201; + ldaa = 200; + cray = .056; + n = 100; + + fprintf(stdout,ROLLING);fprintf(stdout,PREC); + fprintf(stdout,"Precision Linpack Benchmark - PC Version in 'C/C++'\n\n"); + fprintf(stdout,"Compiler %s\n",compiler); + fprintf(stdout,"Optimisation %s\n\n",options); + + ops = (2.0e0*(n*n*n))/3.0 + 2.0*(n*n); + + matgen(a,lda,n,b,&norma); + t1 = second(); + dgefa(a,lda,n,ipvt,&info); + atime[0][0] = second() - t1; + t1 = second(); + dgesl(a,lda,n,ipvt,b,0); + atime[1][0] = second() - t1; + total = atime[0][0] + atime[1][0]; + +/* compute a residual to verify results. */ + + for (i = 0; i < n; i++) { + x[i] = b[i]; + } + matgen(a,lda,n,b,&norma); + for (i = 0; i < n; i++) { + b[i] = -b[i]; + } + dmxpy(n,b,n,lda,x,a); + resid = 0.0; + normx = 0.0; + for (i = 0; i < n; i++) { + resid = (resid > fabs((double)b[i])) + ? resid : fabs((double)b[i]); + normx = (normx > fabs((double)x[i])) + ? normx : fabs((double)x[i]); + } + eps = epslon(ONE); + residn = resid/( n*norma*normx*eps ); + epsn = eps; + x1 = x[0] - 1; + x2 = x[n-1] - 1; + + printf("norm resid resid machep"); + printf(" x[0]-1 x[n-1]-1\n"); + printf("%6.1f %17.8e%17.8e%17.8e%17.8e\n\n", + (double)residn, (double)resid, (double)epsn, + (double)x1, (double)x2); + + fprintf(stderr,"Times are reported for matrices of order %5d\n",n); + fprintf(stderr,"1 pass times for array with leading dimension of%5d\n\n",lda); + fprintf(stderr," dgefa dgesl total Mflops unit"); + fprintf(stderr," ratio\n"); + + atime[2][0] = total; + if (total > 0.0) + { + atime[3][0] = ops/(1.0e6*total); + atime[4][0] = 2.0/atime[3][0]; + } + else + { + atime[3][0] = 0.0; + atime[4][0] = 0.0; + } + atime[5][0] = total/cray; + + print_time(0); + +/************************************************************************ + * Calculate overhead of executing matgen procedure * + ************************************************************************/ + + fprintf (stderr,"\nCalculating matgen overhead\n"); + pass = -20; + loop = NTIMES; + do + { + time1 = second(); + pass = pass + 1; + for ( i = 0 ; i < loop ; i++) + { + matgen(a,lda,n,b,&norma); + } + time2 = second(); + overhead1 = (time2 - time1); + fprintf (stderr,"%10d times %6.2f seconds\n", loop, overhead1); + if (overhead1 > 5.0) + { + pass = 0; + } + if (pass < 0) + { + if (overhead1 < 0.1) + { + loop = loop * 10; + } + else + { + loop = loop * 2; + } + } + } + while (pass < 0); + + overhead1 = overhead1 / (double)loop; + + fprintf (stderr,"Overhead for 1 matgen %12.5f seconds\n\n", overhead1); + +/************************************************************************ + * Calculate matgen/dgefa passes for 5 seconds * + ************************************************************************/ + + fprintf (stderr,"Calculating matgen/dgefa passes for 5 seconds\n"); + pass = -20; + ntimes = NTIMES; + do + { + time1 = second(); + pass = pass + 1; + for ( i = 0 ; i < ntimes ; i++) + { + matgen(a,lda,n,b,&norma); + dgefa(a,lda,n,ipvt,&info ); + } + time2 = second() - time1; + fprintf (stderr,"%10d times %6.2f seconds\n", ntimes, time2); + if (time2 > 5.0) + { + pass = 0; + } + if (pass < 0) + { + if (time2 < 0.1) + { + ntimes = ntimes * 10; + } + else + { + ntimes = ntimes * 2; + } + } + } + while (pass < 0); + + ntimes = 5.0 * (double)ntimes / time2; + if (ntimes == 0) ntimes = 1; + + fprintf (stderr,"Passes used %10d \n\n", ntimes); + fprintf(stderr,"Times for array with leading dimension of%4d\n\n",lda); + fprintf(stderr," dgefa dgesl total Mflops unit"); + fprintf(stderr," ratio\n"); + +/************************************************************************ + * Execute 5 passes * + ************************************************************************/ + + tm2 = ntimes * overhead1; + atime[3][6] = 0; + + for (j=1 ; j<6 ; j++) + { + + t1 = second(); + + for (i = 0; i < ntimes; i++) + { + matgen(a,lda,n,b,&norma); + dgefa(a,lda,n,ipvt,&info ); + } + + atime[0][j] = (second() - t1 - tm2)/ntimes; + + t1 = second(); + + for (i = 0; i < ntimes; i++) + { + dgesl(a,lda,n,ipvt,b,0); + } + + atime[1][j] = (second() - t1)/ntimes; + total = atime[0][j] + atime[1][j]; + atime[2][j] = total; + atime[3][j] = ops/(1.0e6*total); + atime[4][j] = 2.0/atime[3][j]; + atime[5][j] = total/cray; + atime[3][6] = atime[3][6] + atime[3][j]; + + print_time(j); + } + atime[3][6] = atime[3][6] / 5.0; + fprintf (stderr,"Average %11.2f\n", + (double)atime[3][6]); + + fprintf (stderr,"\nCalculating matgen2 overhead\n"); + +/************************************************************************ + * Calculate overhead of executing matgen procedure * + ************************************************************************/ + + time1 = second(); + for ( i = 0 ; i < loop ; i++) + { + matgen(aa,ldaa,n,b,&norma); + } + time2 = second(); + overhead2 = (time2 - time1); + overhead2 = overhead2 / (double)loop; + + fprintf (stderr,"Overhead for 1 matgen %12.5f seconds\n\n", overhead2); + fprintf(stderr,"Times for array with leading dimension of%4d\n\n",ldaa); + fprintf(stderr," dgefa dgesl total Mflops unit"); + fprintf(stderr," ratio\n"); + +/************************************************************************ + * Execute 5 passes * + ************************************************************************/ + + tm2 = ntimes * overhead2; + atime[3][12] = 0; + + for (j=7 ; j<12 ; j++) + { + + t1 = second(); + + for (i = 0; i < ntimes; i++) + { + matgen(aa,ldaa,n,b,&norma); + dgefa(aa,ldaa,n,ipvt,&info ); + } + + atime[0][j] = (second() - t1 - tm2)/ntimes; + + t1 = second(); + + for (i = 0; i < ntimes; i++) + { + dgesl(aa,ldaa,n,ipvt,b,0); + } + + atime[1][j] = (second() - t1)/ntimes; + total = atime[0][j] + atime[1][j]; + atime[2][j] = total; + atime[3][j] = ops/(1.0e6*total); + atime[4][j] = 2.0/atime[3][j]; + atime[5][j] = total/cray; + atime[3][12] = atime[3][12] + atime[3][j]; + + print_time(j); + } + atime[3][12] = atime[3][12] / 5.0; + fprintf (stderr,"Average %11.2f\n", + (double)atime[3][12]); + +/************************************************************************ + * Use minimum average as overall Mflops rating * + ************************************************************************/ + + mflops = atime[3][6]; + if (atime[3][12] < mflops) mflops = atime[3][12]; + + fprintf(stderr,"\n"); + fprintf(stderr,ROLLING);fprintf(stderr,PREC); + fprintf(stderr," Precision %11.2f Mflops \n\n",mflops); + + what_date(); + +/************************************************************************ + * Type details of hardware, software etc. * + ************************************************************************/ + + printf ("Enter the following data which will be " + "appended to file Linpack.txt \n\n"); + printf ("PC Supplier/model ?\n "); + scanf ("%[^\n]", general[1]); + fflush (stdin); + printf ("CPU ?\n "); + scanf ("%[^\n]", general[2]); + fflush (stdin); + printf ("Clock MHz ?\n "); + scanf ("%[^\n]", general[3]); + fflush (stdin); + printf ("Cache ?\n "); + scanf ("%[^\n]", general[4]); + fflush (stdin); + printf ("Chipset/options ?\n "); + scanf ("%[^\n]", general[5]); + fflush (stdin); + printf ("OS/DOS version ?\n "); + scanf ("%[^\n]", general[6]); + fflush (stdin); + printf ("Your name ?\n "); + scanf ("%[^\n]", general[7]); + fflush (stdin); + printf ("Where from ?\n "); + scanf ("%[^\n]", general[8]); + fflush (stdin); + printf ("Mail address ?\n "); + scanf ("%[^\n]", general[0]); + fflush (stdin); + +/************************************************************************ + * Add results to output file LLloops.txt * + ************************************************************************/ + + fprintf (outfile, "----------------- ----------------- --------- " + "--------- ---------\n"); + fprintf (outfile, "LINPACK BENCHMARK FOR PCs 'C/C++' n @ 100\n\n"); + fprintf (outfile, "Month run %d/%d\n", this_month, this_year); + fprintf (outfile, "PC model %s\n", general[1]); + fprintf (outfile, "CPU %s\n", general[2]); + fprintf (outfile, "Clock MHz %s\n", general[3]); + fprintf (outfile, "Cache %s\n", general[4]); + fprintf (outfile, "Options %s\n", general[5]); + fprintf (outfile, "OS/DOS %s\n", general[6]); + fprintf (outfile, "Compiler %s\n", compiler); + fprintf (outfile, "OptLevel %s\n", options); + fprintf (outfile, "Run by %s\n", general[7]); + fprintf (outfile, "From %s\n", general[8]); + fprintf (outfile, "Mail %s\n\n", general[0]); + + fprintf(outfile, "Rolling %s\n",ROLLING); + fprintf(outfile, "Precision %s\n",PREC); + fprintf(outfile, "norm. resid %16.1f\n",(double)residn); + fprintf(outfile, "resid %16.8e\n",(double)resid); + fprintf(outfile, "machep %16.8e\n",(double)epsn); + fprintf(outfile, "x[0]-1 %16.8e\n",(double)x1); + fprintf(outfile, "x[n-1]-1 %16.8e\n",(double)x2); + fprintf(outfile, "matgen 1 seconds %16.5f\n",overhead1); + fprintf(outfile, "matgen 2 seconds %16.5f\n",overhead2); + fprintf(outfile, "Repetitions %16d\n",ntimes); + fprintf(outfile, "Leading dimension %16d\n",lda); + fprintf(outfile, " dgefa dgesl " + " total Mflops\n"); + fprintf(outfile, "1 pass seconds %16.5f %9.5f %9.5f\n", + atime[0][0], atime[1][0], atime[2][0]); + + for (i=1 ; i<6 ; i++) + { + fprintf(outfile, "Repeat seconds %16.5f %9.5f %9.5f %9.2f\n", + atime[0][i], atime[1][i], atime[2][i], atime[3][i]); + } + fprintf(outfile, "Average %46.2f\n",atime[3][6]); + + fprintf(outfile, "Leading dimension %16d\n",ldaa); + + for (i=7 ; i<12 ; i++) + { + fprintf(outfile, "Repeat seconds %16.5f %9.5f %9.5f %9.2f\n", + atime[0][i], atime[1][i], atime[2][i], atime[3][i]); + } + fprintf(outfile, "Average %46.2f\n\n",atime[3][12]); + + fclose (outfile); + + printf("\nPress any key\n"); + Endit = getch(); +} + +/*----------------------*/ +void print_time (int row) + +{ +fprintf(stderr,"%11.5f%11.5f%11.5f%11.2f%11.4f%11.4f\n", (double)atime[0][row], + (double)atime[1][row], (double)atime[2][row], (double)atime[3][row], + (double)atime[4][row], (double)atime[5][row]); + return; +} + +/*----------------------*/ + +void matgen (REAL a[], int lda, int n, REAL b[], REAL *norma) + + +/* We would like to declare a[][lda], but c does not allow it. In this +function, references to a[i][j] are written a[lda*i+j]. */ + +{ + int init, i, j; + + init = 1325; + *norma = 0.0; + for (j = 0; j < n; j++) { + for (i = 0; i < n; i++) { + init = 3125*init % 65536; + a[lda*j+i] = (init - 32768.0)/16384.0; + *norma = (a[lda*j+i] > *norma) ? a[lda*j+i] : *norma; + + /* alternative for some compilers + if (fabs(a[lda*j+i]) > *norma) *norma = fabs(a[lda*j+i]); + */ + } + } + for (i = 0; i < n; i++) { + b[i] = 0.0; + } + for (j = 0; j < n; j++) { + for (i = 0; i < n; i++) { + b[i] = b[i] + a[lda*j+i]; + } + } + return; +} + +/*----------------------*/ +void dgefa(REAL a[], int lda, int n, int ipvt[], int *info) + + +/* We would like to declare a[][lda], but c does not allow it. In this +function, references to a[i][j] are written a[lda*i+j]. */ +/* + dgefa factors a double precision matrix by gaussian elimination. + + dgefa is usually called by dgeco, but it can be called + directly with a saving in time if rcond is not needed. + (time for dgeco) = (1 + 9/n)*(time for dgefa) . + + on entry + + a REAL precision[n][lda] + the matrix to be factored. + + lda integer + the leading dimension of the array a . + + n integer + the order of the matrix a . + + on return + + a an upper triangular matrix and the multipliers + which were used to obtain it. + the factorization can be written a = l*u where + l is a product of permutation and unit lower + triangular matrices and u is upper triangular. + + ipvt integer[n] + an integer vector of pivot indices. + + info integer + = 0 normal value. + = k if u[k][k] .eq. 0.0 . this is not an error + condition for this subroutine, but it does + indicate that dgesl or dgedi will divide by zero + if called. use rcond in dgeco for a reliable + indication of singularity. + + linpack. this version dated 08/14/78 . + cleve moler, university of new mexico, argonne national lab. + + functions + + blas daxpy,dscal,idamax +*/ + +{ +/* internal variables */ + +REAL t; +int j,k,kp1,l,nm1; + + +/* gaussian elimination with partial pivoting */ + + *info = 0; + nm1 = n - 1; + if (nm1 >= 0) { + for (k = 0; k < nm1; k++) { + kp1 = k + 1; + + /* find l = pivot index */ + + l = idamax(n-k,&a[lda*k+k],1) + k; + ipvt[k] = l; + + /* zero pivot implies this column already + triangularized */ + + if (a[lda*k+l] != ZERO) { + + /* interchange if necessary */ + + if (l != k) { + t = a[lda*k+l]; + a[lda*k+l] = a[lda*k+k]; + a[lda*k+k] = t; + } + + /* compute multipliers */ + + t = -ONE/a[lda*k+k]; + dscal(n-(k+1),t,&a[lda*k+k+1],1); + + /* row elimination with column indexing */ + + for (j = kp1; j < n; j++) { + t = a[lda*j+l]; + if (l != k) { + a[lda*j+l] = a[lda*j+k]; + a[lda*j+k] = t; + } + daxpy(n-(k+1),t,&a[lda*k+k+1],1, + &a[lda*j+k+1],1); + } + } + else { + *info = k; + } + } + } + ipvt[n-1] = n-1; + if (a[lda*(n-1)+(n-1)] == ZERO) *info = n-1; + return; +} + +/*----------------------*/ + +void dgesl(REAL a[],int lda,int n,int ipvt[],REAL b[],int job ) + + +/* We would like to declare a[][lda], but c does not allow it. In this +function, references to a[i][j] are written a[lda*i+j]. */ + +/* + dgesl solves the double precision system + a * x = b or trans(a) * x = b + using the factors computed by dgeco or dgefa. + + on entry + + a double precision[n][lda] + the output from dgeco or dgefa. + + lda integer + the leading dimension of the array a . + + n integer + the order of the matrix a . + + ipvt integer[n] + the pivot vector from dgeco or dgefa. + + b double precision[n] + the right hand side vector. + + job integer + = 0 to solve a*x = b , + = nonzero to solve trans(a)*x = b where + trans(a) is the transpose. + + on return + + b the solution vector x . + + error condition + + a division by zero will occur if the input factor contains a + zero on the diagonal. technically this indicates singularity + but it is often caused by improper arguments or improper + setting of lda . it will not occur if the subroutines are + called correctly and if dgeco has set rcond .gt. 0.0 + or dgefa has set info .eq. 0 . + + to compute inverse(a) * c where c is a matrix + with p columns + dgeco(a,lda,n,ipvt,rcond,z) + if (!rcond is too small){ + for (j=0,j= 1) { + for (k = 0; k < nm1; k++) { + l = ipvt[k]; + t = b[l]; + if (l != k){ + b[l] = b[k]; + b[k] = t; + } + daxpy(n-(k+1),t,&a[lda*k+k+1],1,&b[k+1],1 ); + } + } + + /* now solve u*x = y */ + + for (kb = 0; kb < n; kb++) { + k = n - (kb + 1); + b[k] = b[k]/a[lda*k+k]; + t = -b[k]; + daxpy(k,t,&a[lda*k+0],1,&b[0],1 ); + } + } + else { + + /* job = nonzero, solve trans(a) * x = b + first solve trans(u)*y = b */ + + for (k = 0; k < n; k++) { + t = ddot(k,&a[lda*k+0],1,&b[0],1); + b[k] = (b[k] - t)/a[lda*k+k]; + } + + /* now solve trans(l)*x = y */ + + if (nm1 >= 1) { + for (kb = 1; kb < nm1; kb++) { + k = n - (kb+1); + b[k] = b[k] + ddot(n-(k+1),&a[lda*k+k+1],1,&b[k+1],1); + l = ipvt[k]; + if (l != k) { + t = b[l]; + b[l] = b[k]; + b[k] = t; + } + } + } + } + return; +} + +/*----------------------*/ + +void daxpy(int n, REAL da, REAL dx[], int incx, REAL dy[], int incy) +/* + constant times a vector plus a vector. + jack dongarra, linpack, 3/11/78. +*/ + +{ + int i,ix,iy,m,mp1; + + mp1 = 0; + m = 0; + + if(n <= 0) return; + if (da == ZERO) return; + + if(incx != 1 || incy != 1) { + + /* code for unequal increments or equal increments + not equal to 1 */ + + ix = 0; + iy = 0; + if(incx < 0) ix = (-n+1)*incx; + if(incy < 0)iy = (-n+1)*incy; + for (i = 0;i < n; i++) { + dy[iy] = dy[iy] + da*dx[ix]; + ix = ix + incx; + iy = iy + incy; + + } + return; + } + + /* code for both increments equal to 1 */ + + +#ifdef ROLL + + for (i = 0;i < n; i++) { + dy[i] = dy[i] + da*dx[i]; + } + + +#endif + +#ifdef UNROLL + + m = n % 4; + if ( m != 0) { + for (i = 0; i < m; i++) + dy[i] = dy[i] + da*dx[i]; + + if (n < 4) return; + } + for (i = m; i < n; i = i + 4) { + dy[i] = dy[i] + da*dx[i]; + dy[i+1] = dy[i+1] + da*dx[i+1]; + dy[i+2] = dy[i+2] + da*dx[i+2]; + dy[i+3] = dy[i+3] + da*dx[i+3]; + + } + +#endif +return; +} + +/*----------------------*/ + +REAL ddot(int n, REAL dx[], int incx, REAL dy[], int incy) +/* + forms the dot product of two vectors. + jack dongarra, linpack, 3/11/78. +*/ + +{ + REAL dtemp; + int i,ix,iy,m,mp1; + + mp1 = 0; + m = 0; + + dtemp = ZERO; + + if(n <= 0) return(ZERO); + + if(incx != 1 || incy != 1) { + + /* code for unequal increments or equal increments + not equal to 1 */ + + ix = 0; + iy = 0; + if (incx < 0) ix = (-n+1)*incx; + if (incy < 0) iy = (-n+1)*incy; + for (i = 0;i < n; i++) { + dtemp = dtemp + dx[ix]*dy[iy]; + ix = ix + incx; + iy = iy + incy; + + } + return(dtemp); + } + + /* code for both increments equal to 1 */ + + +#ifdef ROLL + + for (i=0;i < n; i++) + dtemp = dtemp + dx[i]*dy[i]; + + return(dtemp); + +#endif + +#ifdef UNROLL + + + m = n % 5; + if (m != 0) { + for (i = 0; i < m; i++) + dtemp = dtemp + dx[i]*dy[i]; + if (n < 5) return(dtemp); + } + for (i = m; i < n; i = i + 5) { + dtemp = dtemp + dx[i]*dy[i] + + dx[i+1]*dy[i+1] + dx[i+2]*dy[i+2] + + dx[i+3]*dy[i+3] + dx[i+4]*dy[i+4]; + } + return(dtemp); + +#endif + +} + +/*----------------------*/ +void dscal(int n, REAL da, REAL dx[], int incx) + +/* scales a vector by a constant. + jack dongarra, linpack, 3/11/78. +*/ + +{ + int i,m,mp1,nincx; + + mp1 = 0; + m = 0; + + if(n <= 0)return; + if(incx != 1) { + + /* code for increment not equal to 1 */ + + nincx = n*incx; + for (i = 0; i < nincx; i = i + incx) + dx[i] = da*dx[i]; + + return; + } + + /* code for increment equal to 1 */ + + +#ifdef ROLL + + for (i = 0; i < n; i++) + dx[i] = da*dx[i]; + + +#endif + +#ifdef UNROLL + + + m = n % 5; + if (m != 0) { + for (i = 0; i < m; i++) + dx[i] = da*dx[i]; + if (n < 5) return; + } + for (i = m; i < n; i = i + 5){ + dx[i] = da*dx[i]; + dx[i+1] = da*dx[i+1]; + dx[i+2] = da*dx[i+2]; + dx[i+3] = da*dx[i+3]; + dx[i+4] = da*dx[i+4]; + } + +#endif + +} + +/*----------------------*/ +int idamax(int n, REAL dx[], int incx) + +/* + finds the index of element having max. absolute value. + jack dongarra, linpack, 3/11/78. +*/ + + +{ + REAL dmax; + int i, ix, itemp; + + if( n < 1 ) return(-1); + if(n ==1 ) return(0); + if(incx != 1) { + + /* code for increment not equal to 1 */ + + ix = 1; + dmax = fabs((double)dx[0]); + ix = ix + incx; + for (i = 1; i < n; i++) { + if(fabs((double)dx[ix]) > dmax) { + itemp = i; + dmax = fabs((double)dx[ix]); + } + ix = ix + incx; + } + } + else { + + /* code for increment equal to 1 */ + + itemp = 0; + dmax = fabs((double)dx[0]); + for (i = 1; i < n; i++) { + if(fabs((double)dx[i]) > dmax) { + itemp = i; + dmax = fabs((double)dx[i]); + } + } + } + return (itemp); +} + +/*----------------------*/ +REAL epslon (REAL x) + +/* + estimate unit roundoff in quantities of size x. +*/ + +{ + REAL a,b,c,eps; +/* + this program should function properly on all systems + satisfying the following two assumptions, + 1. the base used in representing dfloating point + numbers is not a power of three. + 2. the quantity a in statement 10 is represented to + the accuracy used in dfloating point variables + that are stored in memory. + the statement number 10 and the go to 10 are intended to + force optimizing compilers to generate code satisfying + assumption 2. + under these assumptions, it should be true that, + a is not exactly equal to four-thirds, + b has a zero for its last bit or digit, + c is not exactly equal to one, + eps measures the separation of 1.0 from + the next larger dfloating point number. + the developers of eispack would appreciate being informed + about any systems where these assumptions do not hold. + + ***************************************************************** + this routine is one of the auxiliary routines used by eispack iii + to avoid machine dependencies. + ***************************************************************** + + this version dated 4/6/83. +*/ + + a = 4.0e0/3.0e0; + eps = ZERO; + while (eps == ZERO) { + b = a - ONE; + c = b + b + b; + eps = fabs((double)(c-ONE)); + } + return(eps*fabs((double)x)); +} + +/*----------------------*/ +void dmxpy (int n1, REAL y[], int n2, int ldm, REAL x[], REAL m[]) + + +/* We would like to declare m[][ldm], but c does not allow it. In this +function, references to m[i][j] are written m[ldm*i+j]. */ + +/* + purpose: + multiply matrix m times vector x and add the result to vector y. + + parameters: + + n1 integer, number of elements in vector y, and number of rows in + matrix m + + y double [n1], vector of length n1 to which is added + the product m*x + + n2 integer, number of elements in vector x, and number of columns + in matrix m + + ldm integer, leading dimension of array m + + x double [n2], vector of length n2 + + m double [ldm][n2], matrix of n1 rows and n2 columns + + ---------------------------------------------------------------------- +*/ +{ + int j,i,jmin; + /* cleanup odd vector */ + + j = n2 % 2; + if (j >= 1) { + j = j - 1; + for (i = 0; i < n1; i++) + y[i] = (y[i]) + x[j]*m[ldm*j+i]; + } + + /* cleanup odd group of two vectors */ + + j = n2 % 4; + if (j >= 2) { + j = j - 1; + for (i = 0; i < n1; i++) + y[i] = ( (y[i]) + + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; + } + + /* cleanup odd group of four vectors */ + + j = n2 % 8; + if (j >= 4) { + j = j - 1; + for (i = 0; i < n1; i++) + y[i] = ((( (y[i]) + + x[j-3]*m[ldm*(j-3)+i]) + + x[j-2]*m[ldm*(j-2)+i]) + + x[j-1]*m[ldm*(j-1)+i]) + x[j]*m[ldm*j+i]; + } + + /* cleanup odd group of eight vectors */ + + j = n2 % 16; + if (j >= 8) { + j = j - 1; + for (i = 0; i < n1; i++) + y[i] = ((((((( (y[i]) + + x[j-7]*m[ldm*(j-7)+i]) + x[j-6]*m[ldm*(j-6)+i]) + + x[j-5]*m[ldm*(j-5)+i]) + x[j-4]*m[ldm*(j-4)+i]) + + x[j-3]*m[ldm*(j-3)+i]) + x[j-2]*m[ldm*(j-2)+i]) + + x[j-1]*m[ldm*(j-1)+i]) + x[j] *m[ldm*j+i]; + } + + /* main loop - groups of sixteen vectors */ + + jmin = (n2%16)+16; + for (j = jmin-1; j < n2; j = j + 16) { + for (i = 0; i < n1; i++) + y[i] = ((((((((((((((( (y[i]) + + x[j-15]*m[ldm*(j-15)+i]) + + x[j-14]*m[ldm*(j-14)+i]) + + x[j-13]*m[ldm*(j-13)+i]) + + x[j-12]*m[ldm*(j-12)+i]) + + x[j-11]*m[ldm*(j-11)+i]) + + x[j-10]*m[ldm*(j-10)+i]) + + x[j- 9]*m[ldm*(j- 9)+i]) + + x[j- 8]*m[ldm*(j- 8)+i]) + + x[j- 7]*m[ldm*(j- 7)+i]) + + x[j- 6]*m[ldm*(j- 6)+i]) + + x[j- 5]*m[ldm*(j- 5)+i]) + + x[j- 4]*m[ldm*(j- 4)+i]) + + x[j- 3]*m[ldm*(j- 3)+i]) + + x[j- 2]*m[ldm*(j- 2)+i]) + + x[j- 1]*m[ldm*(j- 1)+i]) + + x[j] *m[ldm*j+i]; + } + return; +} + +/*----------------------*/ -- cgit v1.2.3