Skip to content

Commit 2dda40d

Browse files
authored Mar 28, 2024
use atomic operations as in the corresponding getrf
1 parent 9af2a9d commit 2dda40d

File tree

1 file changed

+37 -8 lines changed

1 file changed

+37 -8 lines changed
 

‎lapack/potrf/potrf_parallel.c

+37 -8
Original file line number | Diff line number | Diff line change
@@ -105,6 +105,14 @@ typedef struct {
105105
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
106106
} job_t;
107107

108+
#ifdef HAVE_C11
109+
#define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED)
110+
#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
111+
#else
112+
#define atomic_load_long(p) (BLASLONG)(*(volatile BLASLONG*)(p))
113+
#define atomic_store_long(p, v) (*(volatile BLASLONG *)(p)) = (v)
114+
#endif
115+
108116

109117
#ifndef KERNEL_OPERATION
110118
#ifndef COMPLEX
@@ -233,14 +241,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
233241
}
234242

235243
#ifndef LOWER
244+
MB;
236245
for (i = 0; i <= mypos; i++)
237-
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
246+
atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
247+
// job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
238248
#else
249+
MB
239250
for (i = mypos; i < args -> nthreads; i++)
240-
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
251+
atomic_store_long(&job[mypos].working[i][CACHE_LINE_SIZE * bufferside], (BLASLONG)buffer[bufferside]);
252+
// job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
241253
#endif
242254

243-
WMB;
255+
// WMB;
244256
}
245257

246258
min_i = m_to - m_from;
@@ -271,14 +283,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
271283
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
272284

273285
/* thread has to wait */
274-
if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
286+
if (current != mypos)
287+
do {
288+
jw = atomic_load_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside]);
289+
} while (jw == 0);
290+
MB;
291+
292+
//while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
275293

276294
KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha,
277295
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
278296
c, lda, m_from, xxx);
279297

280298
if (m_from + min_i >= m_to) {
281-
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
299+
atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
300+
// job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
282301
WMB;
283302
}
284303
}
@@ -323,7 +342,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
323342
c, lda, is, xxx);
324343

325344
if (is + min_i >= m_to) {
326-
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
345+
atomic_store_long(&job[current].working[mypos][CACHE_LINE_SIZE * bufferside], job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0);
346+
// job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
327347
WMB;
328348
}
329349
}
@@ -337,9 +357,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
337357

338358
for (i = 0; i < args -> nthreads; i++) {
339359
if (i != mypos) {
340-
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
360+
for (xxx = 0; xxx < DIVIDE_RATE; xxx++)
361+
#if 1
362+
{
363+
do {
364+
jw = atomic_load_long(&job[mypos].working[i][CACHE_LINE_SIZE * xxx]);
365+
} while (jw);
366+
MB;
367+
}
368+
#else
341369
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
342-
}
370+
#endif
371+
// }
343372
}
344373
}
345374

0 commit comments

Comments
 (0)