Refactor to wrap specific implementation

kgryte · kgryte · commit 74acddc8898d · 2020-06-17T12:51:50.000-07:00
diff --git a/lib/node_modules/@stdlib/blas/ext/base/sdssum/README.md b/lib/node_modules/@stdlib/blas/ext/base/sdssum/README.md
@@ -161,14 +161,8 @@ console.log( v );
 
 <!-- /.examples -->
 
-* * *
-
 <section class="references">
 
-## References
-
--   Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050][@higham:1993a].
-
 </section>
 
 <!-- /.references -->
@@ -179,8 +173,6 @@ console.log( v );
 
 [mdn-typed-array]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/TypedArray
 
-[@higham:1993a]: https://doi.org/10.1137/0914050
-
 </section>
 
 <!-- /.links -->
diff --git a/lib/node_modules/@stdlib/blas/ext/base/sdssum/lib/ndarray.js b/lib/node_modules/@stdlib/blas/ext/base/sdssum/lib/ndarray.js
@@ -20,29 +20,14 @@
 
 // MODULES //
 
-var float64ToFloat32 = require( '@stdlib/number/float64/base/to-float32' );
-var floor = require( '@stdlib/math/base/special/floor' );
-
-
-// VARIABLES //
-
-// Blocksize for pairwise summation (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.):
-var BLOCKSIZE = 128;
+var sdssumpw = require( '@stdlib/blas/ext/base/sdssumpw' ).ndarray;
 
 
 // MAIN //
 
 /**
 * Computes the sum of single-precision floating-point strided array elements using extended accumulation.
 *
-* ## Method
-*
-* -   This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
-*
-* ## References
-*
-* -   Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
-*
 * @param {PositiveInteger} N - number of indexed elements
 * @param {Float32Array} x - input array
 * @param {integer} stride - stride length
@@ -60,74 +45,7 @@ var BLOCKSIZE = 128;
 * // returns 5.0
 */
 function sdssum( N, x, stride, offset ) {
-	var ix;
-	var s0;
-	var s1;
-	var s2;
-	var s3;
-	var s4;
-	var s5;
-	var s6;
-	var s7;
-	var M;
-	var s;
-	var n;
-	var i;
-
-	if ( N <= 0 ) {
-		return 0.0;
-	}
-	if ( N === 1 || stride === 0 ) {
-		return x[ offset ];
-	}
-	ix = offset;
-	if ( N < 8 ) {
-		// Use simple summation...
-		s = 0.0;
-		for ( i = 0; i < N; i++ ) {
-			s += x[ ix ];
-			ix += stride;
-		}
-		return float64ToFloat32( s );
-	}
-	if ( N <= BLOCKSIZE ) {
-		// Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
-		s0 = x[ ix ];
-		s1 = x[ ix+stride ];
-		s2 = x[ ix+(2*stride) ];
-		s3 = x[ ix+(3*stride) ];
-		s4 = x[ ix+(4*stride) ];
-		s5 = x[ ix+(5*stride) ];
-		s6 = x[ ix+(6*stride) ];
-		s7 = x[ ix+(7*stride) ];
-		ix += 8 * stride;
-
-		M = N % 8;
-		for ( i = 8; i < N-M; i += 8 ) {
-			s0 += x[ ix ];
-			s1 += x[ ix+stride ];
-			s2 += x[ ix+(2*stride) ];
-			s3 += x[ ix+(3*stride) ];
-			s4 += x[ ix+(4*stride) ];
-			s5 += x[ ix+(5*stride) ];
-			s6 += x[ ix+(6*stride) ];
-			s7 += x[ ix+(7*stride) ];
-			ix += 8 * stride;
-		}
-		// Pairwise sum the accumulators:
-		s = ((s0+s1) + (s2+s3)) + ((s4+s5) + (s6+s7));
-
-		// Clean-up loop...
-		for ( i; i < N; i++ ) {
-			s += x[ ix ];
-			ix += stride;
-		}
-		return float64ToFloat32( s );
-	}
-	// Recurse by dividing by two, but avoiding non-multiples of unroll factor...
-	n = floor( N/2 );
-	n -= n % 8;
-	return float64ToFloat32( sdssum( n, x, stride, ix ) + sdssum( N-n, x, stride, ix+(n*stride) ) ); // eslint-disable-line max-len
+	return sdssumpw( N, x, stride, offset );
 }
 
 
diff --git a/lib/node_modules/@stdlib/blas/ext/base/sdssum/lib/sdssum.js b/lib/node_modules/@stdlib/blas/ext/base/sdssum/lib/sdssum.js
@@ -20,23 +20,14 @@
 
 // MODULES //
 
-var float64ToFloat32 = require( '@stdlib/number/float64/base/to-float32' );
-var sum = require( './ndarray.js' );
+var sdssumpw = require( '@stdlib/blas/ext/base/sdssumpw' );
 
 
 // MAIN //
 
 /**
 * Computes the sum of single-precision floating-point strided array elements using extended accumulation.
 *
-* ## Method
-*
-* -   This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
-*
-* ## References
-*
-* -   Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
-*
 * @param {PositiveInteger} N - number of indexed elements
 * @param {Float32Array} x - input array
 * @param {integer} stride - stride length
@@ -52,31 +43,7 @@ var sum = require( './ndarray.js' );
 * // returns 1.0
 */
 function sdssum( N, x, stride ) {
-	var ix;
-	var s;
-	var i;
-
-	if ( N <= 0 ) {
-		return 0.0;
-	}
-	if ( N === 1 || stride === 0 ) {
-		return x[ 0 ];
-	}
-	if ( stride < 0 ) {
-		ix = (1-N) * stride;
-	} else {
-		ix = 0;
-	}
-	if ( N < 8 ) {
-		// Use simple summation...
-		s = 0.0;
-		for ( i = 0; i < N; i++ ) {
-			s += x[ ix ];
-			ix += stride;
-		}
-		return float64ToFloat32( s );
-	}
-	return sum( N, x, stride, ix );
+	return sdssumpw( N, x, stride );
 }
 
 
diff --git a/lib/node_modules/@stdlib/blas/ext/base/sdssum/manifest.json b/lib/node_modules/@stdlib/blas/ext/base/sdssum/manifest.json
@@ -34,7 +34,9 @@
 				"-lm"
 			],
 			"libpath": [],
-			"dependencies": []
+			"dependencies": [
+				"@stdlib/blas/ext/base/sdssumpw"
+			]
 		}
 	]
 }
diff --git a/lib/node_modules/@stdlib/blas/ext/base/sdssum/package.json b/lib/node_modules/@stdlib/blas/ext/base/sdssum/package.json
@@ -63,8 +63,6 @@
     "sum",
     "total",
     "summation",
-    "pairwise",
-    "pw",
     "strided",
     "strided array",
     "typed",
diff --git a/lib/node_modules/@stdlib/blas/ext/base/sdssum/src/sdssum.c b/lib/node_modules/@stdlib/blas/ext/base/sdssum/src/sdssum.c
@@ -17,105 +17,17 @@
 */
 
 #include "stdlib/blas/ext/base/sdssum.h"
+#include "stdlib/blas/ext/base/sdssumpw.h"
 #include <stdint.h>
 
 /**
 * Computes the sum of single-precision floating-point strided array elements using extended accumulation.
 *
-* ## Method
-*
-* -   This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
-*
-* ## References
-*
-* -   Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
-*
 * @param N       number of indexed elements
 * @param X       input array
 * @param stride  stride length
 * @return        output value
 */
 float stdlib_strided_sdssum( const int64_t N, const float *X, const int64_t stride ) {
-	float *xp1;
-	float *xp2;
-	double sum;
-	int64_t ix;
-	int64_t M;
-	int64_t n;
-	int64_t i;
-	double s0;
-	double s1;
-	double s2;
-	double s3;
-	double s4;
-	double s5;
-	double s6;
-	double s7;
-
-	if ( N <= 0 ) {
-		return 0.0f;
-	}
-	if ( N == 1 || stride == 0 ) {
-		return X[ 0 ];
-	}
-	if ( stride < 0 ) {
-		ix = (1-N) * stride;
-	} else {
-		ix = 0;
-	}
-	if ( N < 8 ) {
-		// Use simple summation...
-		sum = 0.0;
-		for ( i = 0; i < N; i++ ) {
-			sum += X[ ix ];
-			ix += stride;
-		}
-		return sum;
-	}
-	// Blocksize for pairwise summation: 128 (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.)
-	if ( N <= 128 ) {
-		// Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
-		s0 = X[ ix ];
-		s1 = X[ ix+stride ];
-		s2 = X[ ix+(2*stride) ];
-		s3 = X[ ix+(3*stride) ];
-		s4 = X[ ix+(4*stride) ];
-		s5 = X[ ix+(5*stride) ];
-		s6 = X[ ix+(6*stride) ];
-		s7 = X[ ix+(7*stride) ];
-		ix += 8 * stride;
-
-		M = N % 8;
-		for ( i = 8; i < N-M; i += 8 ) {
-			s0 += X[ ix ];
-			s1 += X[ ix+stride ];
-			s2 += X[ ix+(2*stride) ];
-			s3 += X[ ix+(3*stride) ];
-			s4 += X[ ix+(4*stride) ];
-			s5 += X[ ix+(5*stride) ];
-			s6 += X[ ix+(6*stride) ];
-			s7 += X[ ix+(7*stride) ];
-			ix += 8 * stride;
-		}
-		// Pairwise sum the accumulators:
-		sum = ((s0+s1) + (s2+s3)) + ((s4+s5) + (s6+s7));
-
-		// Clean-up loop...
-		for (; i < N; i++ ) {
-			sum += X[ ix ];
-			ix += stride;
-		}
-		return sum;
-	}
-	// Recurse by dividing by two, but avoiding non-multiples of unroll factor...
-	n = N / 2;
-	n -= n % 8;
-	if ( stride < 0 ) {
-		xp1 = (float *)X + ( (n-N)*stride );
-		xp2 = (float *)X;
-	} else {
-		xp1 = (float *)X;
-		xp2 = (float *)X + ( n*stride );
-	}
-	return stdlib_strided_sdssum( n, xp1, stride ) + stdlib_strided_sdssum( N-n, xp2, stride );
+	return stdlib_strided_sdssumpw( N, X, stride );
 }

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -34,7 +34,9 @@` |
| `34` | `34` | `"-lm"` |
| `35` | `35` | `],` |
| `36` | `36` | `"libpath": [],` |
| `37` | | `- "dependencies": []` |
| | `37` | `+ "dependencies": [` |
| | `38` | `+ "@stdlib/blas/ext/base/sdssumpw"` |
| | `39` | `+ ]` |
| `38` | `40` | `}` |
| `39` | `41` | `]` |
| `40` | `42` | `}` |