
Commit f075fe1

Refactor to wrap specific implementation
1 parent 149ca46 commit f075fe1

4 files changed: +9 -210 lines changed


lib/node_modules/@stdlib/blas/ext/base/dsapxsum/lib/dsapxsum.js

+2 -34
@@ -20,22 +20,14 @@

 // MODULES //

-var sum = require( './ndarray.js' );
+var dsapxsumpw = require( '@stdlib/blas/ext/base/dsapxsumpw' );


 // MAIN //

 /**
 * Adds a constant to each single-precision floating-point strided array element and computes the sum using extended accumulation and returning an extended precision result.
 *
-* ## Method
-*
-* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
-*
-* ## References
-*
-* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
-*
 * @param {PositiveInteger} N - number of indexed elements
 * @param {number} alpha - constant
 * @param {Float32Array} x - input array
@@ -52,31 +44,7 @@ var sum = require( './ndarray.js' );
 * // returns 16.0
 */
 function dsapxsum( N, alpha, x, stride ) {
-    var ix;
-    var s;
-    var i;
-
-    if ( N <= 0 ) {
-        return 0.0;
-    }
-    if ( N === 1 || stride === 0 ) {
-        return alpha + x[ 0 ];
-    }
-    if ( stride < 0 ) {
-        ix = (1-N) * stride;
-    } else {
-        ix = 0;
-    }
-    if ( N < 8 ) {
-        // Use simple summation...
-        s = 0.0;
-        for ( i = 0; i < N; i++ ) {
-            s += alpha + x[ ix ];
-            ix += stride;
-        }
-        return s;
-    }
-    return sum( N, alpha, x, stride, ix );
+    return dsapxsumpw( N, alpha, x, stride );
 }


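For context, a minimal usage sketch of the refactored main export, built around the `// returns 16.0` example retained in the docblock above (the specific input array and the `@stdlib/array/float32` require are illustrative assumptions, not part of this diff):

var Float32Array = require( '@stdlib/array/float32' );
var dsapxsum = require( '@stdlib/blas/ext/base/dsapxsum' );

// Hypothetical input: the three elements sum to 1.0, and adding alpha = 5.0 to each of the 3 elements contributes 15.0:
var x = new Float32Array( [ 1.0, -2.0, 2.0 ] );

var v = dsapxsum( x.length, 5.0, x, 1 );
// returns 16.0

After this change, the call above is forwarded directly to `@stdlib/blas/ext/base/dsapxsumpw`, which provides the pairwise summation previously implemented inline.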

lib/node_modules/@stdlib/blas/ext/base/dsapxsum/lib/ndarray.js

+2 -83
@@ -20,28 +20,14 @@

 // MODULES //

-var floor = require( '@stdlib/math/base/special/floor' );
-
-
-// VARIABLES //
-
-// Blocksize for pairwise summation (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.):
-var BLOCKSIZE = 128;
+var dsapxsumpw = require( '@stdlib/blas/ext/base/dsapxsumpw' ).ndarray;


 // MAIN //

 /**
 * Adds a constant to each single-precision floating-point strided array element and computes the sum using extended accumulation and returning an extended precision result.
 *
-* ## Method
-*
-* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
-*
-* ## References
-*
-* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
-*
 * @param {PositiveInteger} N - number of indexed elements
 * @param {number} alpha - constant
 * @param {Float32Array} x - input array
@@ -60,74 +46,7 @@ var BLOCKSIZE = 128;
 * // returns 25.0
 */
 function dsapxsum( N, alpha, x, stride, offset ) {
-    var ix;
-    var s0;
-    var s1;
-    var s2;
-    var s3;
-    var s4;
-    var s5;
-    var s6;
-    var s7;
-    var M;
-    var s;
-    var n;
-    var i;
-
-    if ( N <= 0 ) {
-        return 0.0;
-    }
-    if ( N === 1 || stride === 0 ) {
-        return alpha + x[ offset ];
-    }
-    ix = offset;
-    if ( N < 8 ) {
-        // Use simple summation...
-        s = 0.0;
-        for ( i = 0; i < N; i++ ) {
-            s += alpha + x[ ix ];
-            ix += stride;
-        }
-        return s;
-    }
-    if ( N <= BLOCKSIZE ) {
-        // Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
-        s0 = alpha + x[ ix ];
-        s1 = alpha + x[ ix+stride ];
-        s2 = alpha + x[ ix+(2*stride) ];
-        s3 = alpha + x[ ix+(3*stride) ];
-        s4 = alpha + x[ ix+(4*stride) ];
-        s5 = alpha + x[ ix+(5*stride) ];
-        s6 = alpha + x[ ix+(6*stride) ];
-        s7 = alpha + x[ ix+(7*stride) ];
-        ix += 8 * stride;
-
-        M = N % 8;
-        for ( i = 8; i < N-M; i += 8 ) {
-            s0 += alpha + x[ ix ];
-            s1 += alpha + x[ ix+stride ];
-            s2 += alpha + x[ ix+(2*stride) ];
-            s3 += alpha + x[ ix+(3*stride) ];
-            s4 += alpha + x[ ix+(4*stride) ];
-            s5 += alpha + x[ ix+(5*stride) ];
-            s6 += alpha + x[ ix+(6*stride) ];
-            s7 += alpha + x[ ix+(7*stride) ];
-            ix += 8 * stride;
-        }
-        // Pairwise sum the accumulators:
-        s = ((s0+s1) + (s2+s3)) + ((s4+s5) + (s6+s7));
-
-        // Clean-up loop...
-        for ( i; i < N; i++ ) {
-            s += alpha + x[ ix ];
-            ix += stride;
-        }
-        return s;
-    }
-    // Recurse by dividing by two, but avoiding non-multiples of unroll factor...
-    n = floor( N/2 );
-    n -= n % 8;
-    return dsapxsum( n, alpha, x, stride, ix ) + dsapxsum( N-n, alpha, x, stride, ix+(n*stride) ); // eslint-disable-line max-len
+    return dsapxsumpw( N, alpha, x, stride, offset );
 }


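Similarly, a sketch of the ndarray variant, consistent with the `// returns 25.0` example in the docblock above (the exact input array, stride, and offset are assumptions chosen to reproduce that value):

var Float32Array = require( '@stdlib/array/float32' );
var dsapxsum = require( '@stdlib/blas/ext/base/dsapxsum' ).ndarray;

// Hypothetical input: sum every other element starting at index 1 (4 elements), adding alpha = 5.0 to each:
var x = new Float32Array( [ 2.0, 1.0, 2.0, -2.0, -2.0, 2.0, 3.0, 4.0 ] );

var v = dsapxsum( 4, 5.0, x, 2, 1 );
// returns 25.0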

lib/node_modules/@stdlib/blas/ext/base/dsapxsum/manifest.json

+3 -1
@@ -34,7 +34,9 @@
       "-lm"
     ],
     "libpath": [],
-    "dependencies": []
+    "dependencies": [
+      "@stdlib/blas/ext/base/dsapxsumpw"
+    ]
   }
 ]
}

lib/node_modules/@stdlib/blas/ext/base/dsapxsum/src/dsapxsum.c

+2 -92
@@ -17,108 +17,18 @@
 */

 #include "stdlib/blas/ext/base/dsapxsum.h"
+#include "stdlib/blas/ext/base/dsapxsumpw.h"
 #include <stdint.h>

 /**
 * Adds a constant to each single-precision floating-point strided array element and computes the sum using extended accumulation and returning an extended precision result.
 *
-* ## Method
-*
-* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
-*
-* ## References
-*
-* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
-*
 * @param N number of indexed elements
 * @param alpha constant
 * @param X input array
 * @param stride stride length
 * @return output value
 */
 double stdlib_strided_dsapxsum( const int64_t N, const float alpha, const float *X, const int64_t stride ) {
-    float *xp1;
-    float *xp2;
-    double sum;
-    int64_t ix;
-    int64_t M;
-    int64_t n;
-    int64_t i;
-    double s0;
-    double s1;
-    double s2;
-    double s3;
-    double s4;
-    double s5;
-    double s6;
-    double s7;
-    double a;
-
-    if ( N <= 0 ) {
-        return 0.0;
-    }
-    a = (double)alpha;
-    if ( N == 1 || stride == 0 ) {
-        return a + (double)X[ 0 ];
-    }
-    if ( stride < 0 ) {
-        ix = (1-N) * stride;
-    } else {
-        ix = 0;
-    }
-    if ( N < 8 ) {
-        // Use simple summation...
-        sum = 0.0;
-        for ( i = 0; i < N; i++ ) {
-            sum += a + (double)X[ ix ];
-            ix += stride;
-        }
-        return sum;
-    }
-    // Blocksize for pairwise summation: 128 (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.)
-    if ( N <= 128 ) {
-        // Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
-        s0 = a + (double)X[ ix ];
-        s1 = a + (double)X[ ix+stride ];
-        s2 = a + (double)X[ ix+(2*stride) ];
-        s3 = a + (double)X[ ix+(3*stride) ];
-        s4 = a + (double)X[ ix+(4*stride) ];
-        s5 = a + (double)X[ ix+(5*stride) ];
-        s6 = a + (double)X[ ix+(6*stride) ];
-        s7 = a + (double)X[ ix+(7*stride) ];
-        ix += 8 * stride;
-
-        M = N % 8;
-        for ( i = 8; i < N-M; i += 8 ) {
-            s0 += a + (double)X[ ix ];
-            s1 += a + (double)X[ ix+stride ];
-            s2 += a + (double)X[ ix+(2*stride) ];
-            s3 += a + (double)X[ ix+(3*stride) ];
-            s4 += a + (double)X[ ix+(4*stride) ];
-            s5 += a + (double)X[ ix+(5*stride) ];
-            s6 += a + (double)X[ ix+(6*stride) ];
-            s7 += a + (double)X[ ix+(7*stride) ];
-            ix += 8 * stride;
-        }
-        // Pairwise sum the accumulators:
-        sum = ((s0+s1) + (s2+s3)) + ((s4+s5) + (s6+s7));
-
-        // Clean-up loop...
-        for (; i < N; i++ ) {
-            sum += a + (double)X[ ix ];
-            ix += stride;
-        }
-        return sum;
-    }
-    // Recurse by dividing by two, but avoiding non-multiples of unroll factor...
-    n = N / 2;
-    n -= n % 8;
-    if ( stride < 0 ) {
-        xp1 = (float *)X + ( (n-N)*stride );
-        xp2 = (float *)X;
-    } else {
-        xp1 = (float *)X;
-        xp2 = (float *)X + ( n*stride );
-    }
-    return stdlib_strided_dsapxsum( n, alpha, xp1, stride ) + stdlib_strided_dsapxsum( N-n, alpha, xp2, stride );
+    return stdlib_strided_dsapxsumpw( N, alpha, X, stride );
 }

0 commit comments
