Skip to content

Commit 74acddc

Browse files
committed
Refactor to wrap specific implementation
1 parent c5fd0ea commit 74acddc

File tree

6 files changed

+9
-220
lines changed

6 files changed

+9
-220
lines changed

lib/node_modules/@stdlib/blas/ext/base/sdssum/README.md

-8
Original file line numberDiff line numberDiff line change
@@ -161,14 +161,8 @@ console.log( v );
161161

162162
<!-- /.examples -->
163163

164-
* * *
165-
166164
<section class="references">
167165

168-
## References
169-
170-
- Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050][@higham:1993a].
171-
172166
</section>
173167

174168
<!-- /.references -->
@@ -179,8 +173,6 @@ console.log( v );
179173

180174
[mdn-typed-array]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/TypedArray
181175

182-
[@higham:1993a]: https://doi.org/10.1137/0914050
183-
184176
</section>
185177

186178
<!-- /.links -->

lib/node_modules/@stdlib/blas/ext/base/sdssum/lib/ndarray.js

+2-84
Original file line numberDiff line numberDiff line change
@@ -20,29 +20,14 @@
2020

2121
// MODULES //
2222

23-
var float64ToFloat32 = require( '@stdlib/number/float64/base/to-float32' );
24-
var floor = require( '@stdlib/math/base/special/floor' );
25-
26-
27-
// VARIABLES //
28-
29-
// Blocksize for pairwise summation (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.):
30-
var BLOCKSIZE = 128;
23+
var sdssumpw = require( '@stdlib/blas/ext/base/sdssumpw' ).ndarray;
3124

3225

3326
// MAIN //
3427

3528
/**
3629
* Computes the sum of single-precision floating-point strided array elements using extended accumulation.
3730
*
38-
* ## Method
39-
*
40-
* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
41-
*
42-
* ## References
43-
*
44-
* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
45-
*
4631
* @param {PositiveInteger} N - number of indexed elements
4732
* @param {Float32Array} x - input array
4833
* @param {integer} stride - stride length
@@ -60,74 +45,7 @@ var BLOCKSIZE = 128;
6045
* // returns 5.0
6146
*/
6247
function sdssum( N, x, stride, offset ) {
63-
var ix;
64-
var s0;
65-
var s1;
66-
var s2;
67-
var s3;
68-
var s4;
69-
var s5;
70-
var s6;
71-
var s7;
72-
var M;
73-
var s;
74-
var n;
75-
var i;
76-
77-
if ( N <= 0 ) {
78-
return 0.0;
79-
}
80-
if ( N === 1 || stride === 0 ) {
81-
return x[ offset ];
82-
}
83-
ix = offset;
84-
if ( N < 8 ) {
85-
// Use simple summation...
86-
s = 0.0;
87-
for ( i = 0; i < N; i++ ) {
88-
s += x[ ix ];
89-
ix += stride;
90-
}
91-
return float64ToFloat32( s );
92-
}
93-
if ( N <= BLOCKSIZE ) {
94-
// Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
95-
s0 = x[ ix ];
96-
s1 = x[ ix+stride ];
97-
s2 = x[ ix+(2*stride) ];
98-
s3 = x[ ix+(3*stride) ];
99-
s4 = x[ ix+(4*stride) ];
100-
s5 = x[ ix+(5*stride) ];
101-
s6 = x[ ix+(6*stride) ];
102-
s7 = x[ ix+(7*stride) ];
103-
ix += 8 * stride;
104-
105-
M = N % 8;
106-
for ( i = 8; i < N-M; i += 8 ) {
107-
s0 += x[ ix ];
108-
s1 += x[ ix+stride ];
109-
s2 += x[ ix+(2*stride) ];
110-
s3 += x[ ix+(3*stride) ];
111-
s4 += x[ ix+(4*stride) ];
112-
s5 += x[ ix+(5*stride) ];
113-
s6 += x[ ix+(6*stride) ];
114-
s7 += x[ ix+(7*stride) ];
115-
ix += 8 * stride;
116-
}
117-
// Pairwise sum the accumulators:
118-
s = ((s0+s1) + (s2+s3)) + ((s4+s5) + (s6+s7));
119-
120-
// Clean-up loop...
121-
for ( i; i < N; i++ ) {
122-
s += x[ ix ];
123-
ix += stride;
124-
}
125-
return float64ToFloat32( s );
126-
}
127-
// Recurse by dividing by two, but avoiding non-multiples of unroll factor...
128-
n = floor( N/2 );
129-
n -= n % 8;
130-
return float64ToFloat32( sdssum( n, x, stride, ix ) + sdssum( N-n, x, stride, ix+(n*stride) ) ); // eslint-disable-line max-len
48+
return sdssumpw( N, x, stride, offset );
13149
}
13250

13351

lib/node_modules/@stdlib/blas/ext/base/sdssum/lib/sdssum.js

+2-35
Original file line numberDiff line numberDiff line change
@@ -20,23 +20,14 @@
2020

2121
// MODULES //
2222

23-
var float64ToFloat32 = require( '@stdlib/number/float64/base/to-float32' );
24-
var sum = require( './ndarray.js' );
23+
var sdssumpw = require( '@stdlib/blas/ext/base/sdssumpw' );
2524

2625

2726
// MAIN //
2827

2928
/**
3029
* Computes the sum of single-precision floating-point strided array elements using extended accumulation.
3130
*
32-
* ## Method
33-
*
34-
* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
35-
*
36-
* ## References
37-
*
38-
* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
39-
*
4031
* @param {PositiveInteger} N - number of indexed elements
4132
* @param {Float32Array} x - input array
4233
* @param {integer} stride - stride length
@@ -52,31 +43,7 @@ var sum = require( './ndarray.js' );
5243
* // returns 1.0
5344
*/
5445
function sdssum( N, x, stride ) {
55-
var ix;
56-
var s;
57-
var i;
58-
59-
if ( N <= 0 ) {
60-
return 0.0;
61-
}
62-
if ( N === 1 || stride === 0 ) {
63-
return x[ 0 ];
64-
}
65-
if ( stride < 0 ) {
66-
ix = (1-N) * stride;
67-
} else {
68-
ix = 0;
69-
}
70-
if ( N < 8 ) {
71-
// Use simple summation...
72-
s = 0.0;
73-
for ( i = 0; i < N; i++ ) {
74-
s += x[ ix ];
75-
ix += stride;
76-
}
77-
return float64ToFloat32( s );
78-
}
79-
return sum( N, x, stride, ix );
46+
return sdssumpw( N, x, stride );
8047
}
8148

8249

lib/node_modules/@stdlib/blas/ext/base/sdssum/manifest.json

+3-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@
3434
"-lm"
3535
],
3636
"libpath": [],
37-
"dependencies": []
37+
"dependencies": [
38+
"@stdlib/blas/ext/base/sdssumpw"
39+
]
3840
}
3941
]
4042
}

lib/node_modules/@stdlib/blas/ext/base/sdssum/package.json

-2
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,6 @@
6363
"sum",
6464
"total",
6565
"summation",
66-
"pairwise",
67-
"pw",
6866
"strided",
6967
"strided array",
7068
"typed",

lib/node_modules/@stdlib/blas/ext/base/sdssum/src/sdssum.c

+2-90
Original file line numberDiff line numberDiff line change
@@ -17,105 +17,17 @@
1717
*/
1818

1919
#include "stdlib/blas/ext/base/sdssum.h"
20+
#include "stdlib/blas/ext/base/sdssumpw.h"
2021
#include <stdint.h>
2122

2223
/**
2324
* Computes the sum of single-precision floating-point strided array elements using extended accumulation.
2425
*
25-
* ## Method
26-
*
27-
* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
28-
*
29-
* ## References
30-
*
31-
* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
32-
*
3326
* @param N number of indexed elements
3427
* @param X input array
3528
* @param stride stride length
3629
* @return output value
3730
*/
3831
float stdlib_strided_sdssum( const int64_t N, const float *X, const int64_t stride ) {
39-
float *xp1;
40-
float *xp2;
41-
double sum;
42-
int64_t ix;
43-
int64_t M;
44-
int64_t n;
45-
int64_t i;
46-
double s0;
47-
double s1;
48-
double s2;
49-
double s3;
50-
double s4;
51-
double s5;
52-
double s6;
53-
double s7;
54-
55-
if ( N <= 0 ) {
56-
return 0.0f;
57-
}
58-
if ( N == 1 || stride == 0 ) {
59-
return X[ 0 ];
60-
}
61-
if ( stride < 0 ) {
62-
ix = (1-N) * stride;
63-
} else {
64-
ix = 0;
65-
}
66-
if ( N < 8 ) {
67-
// Use simple summation...
68-
sum = 0.0;
69-
for ( i = 0; i < N; i++ ) {
70-
sum += X[ ix ];
71-
ix += stride;
72-
}
73-
return sum;
74-
}
75-
// Blocksize for pairwise summation: 128 (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.)
76-
if ( N <= 128 ) {
77-
// Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
78-
s0 = X[ ix ];
79-
s1 = X[ ix+stride ];
80-
s2 = X[ ix+(2*stride) ];
81-
s3 = X[ ix+(3*stride) ];
82-
s4 = X[ ix+(4*stride) ];
83-
s5 = X[ ix+(5*stride) ];
84-
s6 = X[ ix+(6*stride) ];
85-
s7 = X[ ix+(7*stride) ];
86-
ix += 8 * stride;
87-
88-
M = N % 8;
89-
for ( i = 8; i < N-M; i += 8 ) {
90-
s0 += X[ ix ];
91-
s1 += X[ ix+stride ];
92-
s2 += X[ ix+(2*stride) ];
93-
s3 += X[ ix+(3*stride) ];
94-
s4 += X[ ix+(4*stride) ];
95-
s5 += X[ ix+(5*stride) ];
96-
s6 += X[ ix+(6*stride) ];
97-
s7 += X[ ix+(7*stride) ];
98-
ix += 8 * stride;
99-
}
100-
// Pairwise sum the accumulators:
101-
sum = ((s0+s1) + (s2+s3)) + ((s4+s5) + (s6+s7));
102-
103-
// Clean-up loop...
104-
for (; i < N; i++ ) {
105-
sum += X[ ix ];
106-
ix += stride;
107-
}
108-
return sum;
109-
}
110-
// Recurse by dividing by two, but avoiding non-multiples of unroll factor...
111-
n = N / 2;
112-
n -= n % 8;
113-
if ( stride < 0 ) {
114-
xp1 = (float *)X + ( (n-N)*stride );
115-
xp2 = (float *)X;
116-
} else {
117-
xp1 = (float *)X;
118-
xp2 = (float *)X + ( n*stride );
119-
}
120-
return stdlib_strided_sdssum( n, xp1, stride ) + stdlib_strided_sdssum( N-n, xp2, stride );
32+
return stdlib_strided_sdssumpw( N, X, stride );
12133
}

0 commit comments

Comments
 (0)