You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardExpand all lines: lib/node_modules/@stdlib/blas/ext/base/sdssum/README.md
-8
Original file line number
Diff line number
Diff line change
@@ -161,14 +161,8 @@ console.log( v );
161
161
162
162
<!-- /.examples -->
163
163
164
-
* * *
165
-
166
164
<section class="references">
167
165
168
-
## References
169
-
170
-
- Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050][@higham:1993a].
// Blocksize for pairwise summation (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.):
* Computes the sum of single-precision floating-point strided array elements using extended accumulation.
37
30
*
38
-
* ## Method
39
-
*
40
-
* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
41
-
*
42
-
* ## References
43
-
*
44
-
* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
45
-
*
46
31
* @param {PositiveInteger} N - number of indexed elements
47
32
* @param {Float32Array} x - input array
48
33
* @param {integer} stride - stride length
@@ -60,74 +45,7 @@ var BLOCKSIZE = 128;
60
45
* // returns 5.0
61
46
*/
62
47
functionsdssum(N,x,stride,offset){
63
-
varix;
64
-
vars0;
65
-
vars1;
66
-
vars2;
67
-
vars3;
68
-
vars4;
69
-
vars5;
70
-
vars6;
71
-
vars7;
72
-
varM;
73
-
vars;
74
-
varn;
75
-
vari;
76
-
77
-
if(N<=0){
78
-
return0.0;
79
-
}
80
-
if(N===1||stride===0){
81
-
returnx[offset];
82
-
}
83
-
ix=offset;
84
-
if(N<8){
85
-
// Use simple summation...
86
-
s=0.0;
87
-
for(i=0;i<N;i++){
88
-
s+=x[ix];
89
-
ix+=stride;
90
-
}
91
-
returnfloat64ToFloat32(s);
92
-
}
93
-
if(N<=BLOCKSIZE){
94
-
// Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
95
-
s0=x[ix];
96
-
s1=x[ix+stride];
97
-
s2=x[ix+(2*stride)];
98
-
s3=x[ix+(3*stride)];
99
-
s4=x[ix+(4*stride)];
100
-
s5=x[ix+(5*stride)];
101
-
s6=x[ix+(6*stride)];
102
-
s7=x[ix+(7*stride)];
103
-
ix+=8*stride;
104
-
105
-
M=N%8;
106
-
for(i=8;i<N-M;i+=8){
107
-
s0+=x[ix];
108
-
s1+=x[ix+stride];
109
-
s2+=x[ix+(2*stride)];
110
-
s3+=x[ix+(3*stride)];
111
-
s4+=x[ix+(4*stride)];
112
-
s5+=x[ix+(5*stride)];
113
-
s6+=x[ix+(6*stride)];
114
-
s7+=x[ix+(7*stride)];
115
-
ix+=8*stride;
116
-
}
117
-
// Pairwise sum the accumulators:
118
-
s=((s0+s1)+(s2+s3))+((s4+s5)+(s6+s7));
119
-
120
-
// Clean-up loop...
121
-
for(i;i<N;i++){
122
-
s+=x[ix];
123
-
ix+=stride;
124
-
}
125
-
returnfloat64ToFloat32(s);
126
-
}
127
-
// Recurse by dividing by two, but avoiding non-multiples of unroll factor...
* Computes the sum of single-precision floating-point strided array elements using extended accumulation.
31
30
*
32
-
* ## Method
33
-
*
34
-
* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
35
-
*
36
-
* ## References
37
-
*
38
-
* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
39
-
*
40
31
* @param {PositiveInteger} N - number of indexed elements
41
32
* @param {Float32Array} x - input array
42
33
* @param {integer} stride - stride length
@@ -52,31 +43,7 @@ var sum = require( './ndarray.js' );
Copy file name to clipboardExpand all lines: lib/node_modules/@stdlib/blas/ext/base/sdssum/src/sdssum.c
+2-90
Original file line number
Diff line number
Diff line change
@@ -17,105 +17,17 @@
17
17
*/
18
18
19
19
#include"stdlib/blas/ext/base/sdssum.h"
20
+
#include"stdlib/blas/ext/base/sdssumpw.h"
20
21
#include<stdint.h>
21
22
22
23
/**
23
24
* Computes the sum of single-precision floating-point strided array elements using extended accumulation.
24
25
*
25
-
* ## Method
26
-
*
27
-
* - This implementation uses pairwise summation, which accrues rounding error `O(log2 N)` instead of `O(N)`. The recursion depth is also `O(log2 N)`.
28
-
*
29
-
* ## References
30
-
*
31
-
* - Higham, Nicholas J. 1993. "The Accuracy of Floating Point Summation." _SIAM Journal on Scientific Computing_ 14 (4): 783–99. doi:[10.1137/0914050](https://doi.org/10.1137/0914050).
// Blocksize for pairwise summation: 128 (NOTE: decreasing the blocksize decreases rounding error as more pairs are summed, but also decreases performance. Because the inner loop is unrolled eight times, the blocksize is effectively `16`.)
76
-
if ( N <= 128 ) {
77
-
// Sum a block with 8 accumulators (by loop unrolling, we lower the effective blocksize to 16)...
78
-
s0=X[ ix ];
79
-
s1=X[ ix+stride ];
80
-
s2=X[ ix+(2*stride) ];
81
-
s3=X[ ix+(3*stride) ];
82
-
s4=X[ ix+(4*stride) ];
83
-
s5=X[ ix+(5*stride) ];
84
-
s6=X[ ix+(6*stride) ];
85
-
s7=X[ ix+(7*stride) ];
86
-
ix+=8*stride;
87
-
88
-
M=N % 8;
89
-
for ( i=8; i<N-M; i+=8 ) {
90
-
s0+=X[ ix ];
91
-
s1+=X[ ix+stride ];
92
-
s2+=X[ ix+(2*stride) ];
93
-
s3+=X[ ix+(3*stride) ];
94
-
s4+=X[ ix+(4*stride) ];
95
-
s5+=X[ ix+(5*stride) ];
96
-
s6+=X[ ix+(6*stride) ];
97
-
s7+=X[ ix+(7*stride) ];
98
-
ix+=8*stride;
99
-
}
100
-
// Pairwise sum the accumulators:
101
-
sum= ((s0+s1) + (s2+s3)) + ((s4+s5) + (s6+s7));
102
-
103
-
// Clean-up loop...
104
-
for (; i<N; i++ ) {
105
-
sum+=X[ ix ];
106
-
ix+=stride;
107
-
}
108
-
returnsum;
109
-
}
110
-
// Recurse by dividing by two, but avoiding non-multiples of unroll factor...
111
-
n=N / 2;
112
-
n-=n % 8;
113
-
if ( stride<0 ) {
114
-
xp1= (float*)X+ ( (n-N)*stride );
115
-
xp2= (float*)X;
116
-
} else {
117
-
xp1= (float*)X;
118
-
xp2= (float*)X+ ( n*stride );
119
-
}
120
-
returnstdlib_strided_sdssum( n, xp1, stride ) +stdlib_strided_sdssum( N-n, xp2, stride );
0 commit comments