
Commit f075fe1

Refactor to wrap specific implementation
1 parent 149ca46 commit f075fe1

4 files changed: +9 -210 lines changed


lib/node_modules/@stdlib/blas/ext/base/dsapxsum/lib/dsapxsum.js

+2 -34
@@ -20,22 +20,14 @@

 // MODULES //

-var sum = require( './ndarray.js' );
+var dsapxsumpw = require( '@stdlib/blas/ext/base/dsapxsumpw' );


 // MAIN //

 /**
 * Adds a constant to each single-precision floating-point strided array element and computes the sum using extended accumulation and returning an extended precision result.
 *
-* ## Method
-*
-* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
-*
-* ## References
-*
-* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
-*
 * @param {PositiveInteger} N - number of indexed elements
 * @param {number} alpha - constant
 * @param {Float32Array} x - input array
@@ -52,31 +44,7 @@ var sum = require( './ndarray.js' );
 * // returns 16.0
 */
 function dsapxsum( N, alpha, x, stride ) {
-    var ix;
-    var s;
-    var i;
-
-    if ( N <= 0 ) {
-        return 0.0;
-    }
-    if ( N === 1 || stride === 0 ) {
-        return alpha + x[ 0 ];
-    }
-    if ( stride < 0 ) {
-        ix = (1-N) * stride;
-    } else {
-        ix = 0;
-    }
-    if ( N < 8 ) {
-        // Use simple summation...
-        s = 0.0;
-        for ( i = 0; i < N; i++ ) {
-            s += alpha + x[ ix ];
-            ix += stride;
-        }
-        return s;
-    }
-    return sum( N, alpha, x, stride, ix );
+    return dsapxsumpw( N, alpha, x, stride );
 }


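For context, a minimal usage sketch of the refactored main export, built around the `// returns 16.0` example retained in the docblock above (the specific input array and the `@stdlib/array/float32` require are illustrative assumptions, not part of this diff):

var Float32Array = require( '@stdlib/array/float32' );
var dsapxsum = require( '@stdlib/blas/ext/base/dsapxsum' );

// Hypothetical input: the three elements sum to 1.0, and adding alpha = 5.0 to each of the 3 elements contributes 15.0:
var x = new Float32Array( [ 1.0, -2.0, 2.0 ] );

var v = dsapxsum( x.length, 5.0, x, 1 );
// returns 16.0

After this change, the call above is forwarded directly to `@stdlib/blas/ext/base/dsapxsumpw`, which provides the pairwise summation previously implemented inline.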

lib/node_modules/@stdlib/blas/ext/base/dsapxsum/lib/ndarray.js

+2 -83
@@ -20,28 +20,14 @@

 // MODULES //

-var floor = require( '@stdlib/math/base/special/floor' );
-
-
-// VARIABLES //
-
-// Blocksize for pairwise summation (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.):
-var BLOCKSIZE = 128;
+var dsapxsumpw = require( '@stdlib/blas/ext/base/dsapxsumpw' ).ndarray;


 // MAIN //

 /**
 * Adds a constant to each single-precision floating-point strided array element and computes the sum using extended accumulation and returning an extended precision result.
 *
-* ## Method
-*
-* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
-*
-* ## References
-*
-* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
-*
 * @param {PositiveInteger} N - number of indexed elements
 * @param {number} alpha - constant
 * @param {Float32Array} x - input array
@@ -60,74 +46,7 @@ var BLOCKSIZE = 128;
 * // returns 25.0
 */
 function dsapxsum( N, alpha, x, stride, offset ) {
-    var ix;
-    var s0;
-    var s1;
-    var s2;
-    var s3;
-    var s4;
-    var s5;
-    var s6;
-    var s7;
-    var M;
-    var s;
-    var n;
-    var i;
-
-    if ( N <= 0 ) {
-        return 0.0;
-    }
-    if ( N === 1 || stride === 0 ) {
-        return alpha + x[ offset ];
-    }
-    ix = offset;
-    if ( N < 8 ) {
-        // Use simple summation...
-        s = 0.0;
-        for ( i = 0; i < N; i++ ) {
-            s += alpha + x[ ix ];
-            ix += stride;
-        }
-        return s;
-    }
-    if ( N <= BLOCKSIZE ) {
-        // Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
-        s0 = alpha + x[ ix ];
-        s1 = alpha + x[ ix+stride ];
-        s2 = alpha + x[ ix+(2*stride) ];
-        s3 = alpha + x[ ix+(3*stride) ];
-        s4 = alpha + x[ ix+(4*stride) ];
-        s5 = alpha + x[ ix+(5*stride) ];
-        s6 = alpha + x[ ix+(6*stride) ];
-        s7 = alpha + x[ ix+(7*stride) ];
-        ix += 8 * stride;
-
-        M = N % 8;
-        for ( i = 8; i < N-M; i += 8 ) {
-            s0 += alpha + x[ ix ];
-            s1 += alpha + x[ ix+stride ];
-            s2 += alpha + x[ ix+(2*stride) ];
-            s3 += alpha + x[ ix+(3*stride) ];
-            s4 += alpha + x[ ix+(4*stride) ];
-            s5 += alpha + x[ ix+(5*stride) ];
-            s6 += alpha + x[ ix+(6*stride) ];
-            s7 += alpha + x[ ix+(7*stride) ];
-            ix += 8 * stride;
-        }
-        // Pairwise sum the accumulators:
-        s = ((s0+s1) + (s2+s3)) + ((s4+s5) + (s6+s7));
-
-        // Clean-up loop...
-        for ( i; i < N; i++ ) {
-            s += alpha + x[ ix ];
-            ix += stride;
-        }
-        return s;
-    }
-    // Recurse by dividing by two, but avoiding non-multiples of unroll factor...
-    n = floor( N/2 );
-    n -= n % 8;
-    return dsapxsum( n, alpha, x, stride, ix ) + dsapxsum( N-n, alpha, x, stride, ix+(n*stride) ); // eslint-disable-line max-len
+    return dsapxsumpw( N, alpha, x, stride, offset );
 }


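Similarly, a sketch of the ndarray variant, consistent with the `// returns 25.0` example in the docblock above (the exact input array, stride, and offset are assumptions chosen to reproduce that value):

var Float32Array = require( '@stdlib/array/float32' );
var dsapxsum = require( '@stdlib/blas/ext/base/dsapxsum' ).ndarray;

// Hypothetical input: sum every other element starting at index 1 (4 elements), adding alpha = 5.0 to each:
var x = new Float32Array( [ 2.0, 1.0, 2.0, -2.0, -2.0, 2.0, 3.0, 4.0 ] );

var v = dsapxsum( 4, 5.0, x, 2, 1 );
// returns 25.0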

lib/node_modules/@stdlib/blas/ext/base/dsapxsum/manifest.json

+3 -1
@@ -34,7 +34,9 @@
       "-lm"
     ],
     "libpath": [],
-    "dependencies": []
+    "dependencies": [
+      "@stdlib/blas/ext/base/dsapxsumpw"
+    ]
   }
 ]
}

lib/node_modules/@stdlib/blas/ext/base/dsapxsum/src/dsapxsum.c

+2 -92
@@ -17,108 +17,18 @@
 */

 #include "stdlib/blas/ext/base/dsapxsum.h"
+#include "stdlib/blas/ext/base/dsapxsumpw.h"
 #include <stdint.h>

 /**
 * Adds a constant to each single-precision floating-point strided array element and computes the sum using extended accumulation and returning an extended precision result.
 *
-* ## Method
-*
-* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
-*
-* ## References
-*
-* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
-*
 * @param N number of indexed elements
 * @param alpha constant
 * @param X input array
 * @param stride stride length
 * @return output value
 */
 double stdlib_strided_dsapxsum( const int64_t N, const float alpha, const float *X, const int64_t stride ) {
-    float *xp1;
-    float *xp2;
-    double sum;
-    int64_t ix;
-    int64_t M;
-    int64_t n;
-    int64_t i;
-    double s0;
-    double s1;
-    double s2;
-    double s3;
-    double s4;
-    double s5;
-    double s6;
-    double s7;
-    double a;
-
-    if ( N <= 0 ) {
-        return 0.0;
-    }
-    a = (double)alpha;
-    if ( N == 1 || stride == 0 ) {
-        return a + (double)X[ 0 ];
-    }
-    if ( stride < 0 ) {
-        ix = (1-N) * stride;
-    } else {
-        ix = 0;
-    }
-    if ( N < 8 ) {
-        // Use simple summation...
-        sum = 0.0;
-        for ( i = 0; i < N; i++ ) {
-            sum += a + (double)X[ ix ];
-            ix += stride;
-        }
-        return sum;
-    }
-    // Blocksize for pairwise summation: 128 (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.)
-    if ( N <= 128 ) {
-        // Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
-        s0 = a + (double)X[ ix ];
-        s1 = a + (double)X[ ix+stride ];
-        s2 = a + (double)X[ ix+(2*stride) ];
-        s3 = a + (double)X[ ix+(3*stride) ];
-        s4 = a + (double)X[ ix+(4*stride) ];
-        s5 = a + (double)X[ ix+(5*stride) ];
-        s6 = a + (double)X[ ix+(6*stride) ];
-        s7 = a + (double)X[ ix+(7*stride) ];
-        ix += 8 * stride;
-
-        M = N % 8;
-        for ( i = 8; i < N-M; i += 8 ) {
-            s0 += a + (double)X[ ix ];
-            s1 += a + (double)X[ ix+stride ];
-            s2 += a + (double)X[ ix+(2*stride) ];
-            s3 += a + (double)X[ ix+(3*stride) ];
-            s4 += a + (double)X[ ix+(4*stride) ];
-            s5 += a + (double)X[ ix+(5*stride) ];
-            s6 += a + (double)X[ ix+(6*stride) ];
-            s7 += a + (double)X[ ix+(7*stride) ];
-            ix += 8 * stride;
-        }
-        // Pairwise sum the accumulators:
-        sum = ((s0+s1) + (s2+s3)) + ((s4+s5) + (s6+s7));
-
-        // Clean-up loop...
-        for (; i < N; i++ ) {
-            sum += a + (double)X[ ix ];
-            ix += stride;
-        }
-        return sum;
-    }
-    // Recurse by dividing by two, but avoiding non-multiples of unroll factor...
-    n = N / 2;
-    n -= n % 8;
-    if ( stride < 0 ) {
-        xp1 = (float *)X + ( (n-N)*stride );
-        xp2 = (float *)X;
-    } else {
-        xp1 = (float *)X;
-        xp2 = (float *)X + ( n*stride );
-    }
-    return stdlib_strided_dsapxsum( n, alpha, xp1, stride ) + stdlib_strided_dsapxsum( N-n, alpha, xp2, stride );
+    return stdlib_strided_dsapxsumpw( N, alpha, X, stride );
 }

0 commit comments
