@@ -105,6 +105,14 @@ typedef struct {
105
105
BLASLONG working [MAX_CPU_NUMBER ][CACHE_LINE_SIZE * DIVIDE_RATE ];
106
106
} job_t ;
107
107
108
/*
 * Long-sized atomic load/store helpers used to publish and poll the
 * per-thread job[].working[] buffer slots.
 *
 * HAVE_C11 path: GCC/Clang __atomic builtins.  __ATOMIC_RELAXED gives
 * atomicity but NO ordering; callers pair these with explicit MB/WMB
 * barriers, so relaxed is intentional here.
 * Fallback path: a volatile access.  This stops the compiler from
 * caching or eliding the load/store, but hardware atomicity is only
 * what a naturally aligned word access provides — TODO confirm this is
 * acceptable on all non-HAVE_C11 targets.
 *
 * NOTE: the macro name must be immediately followed by '(' — with a
 * space (as previously written) these become object-like macros whose
 * expansion starts with "(p)", breaking every call site.
 */
#ifdef HAVE_C11
#define atomic_load_long(p)     __atomic_load_n(p, __ATOMIC_RELAXED)
#define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED)
#else
#define atomic_load_long(p)     ((BLASLONG)(*(volatile BLASLONG *)(p)))
/* Fully parenthesized so the store is safe inside larger expressions. */
#define atomic_store_long(p, v) ((*(volatile BLASLONG *)(p)) = (v))
#endif
115
+
108
116
109
117
#ifndef KERNEL_OPERATION
110
118
#ifndef COMPLEX
@@ -233,14 +241,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
233
241
}
234
242
235
243
#ifndef LOWER
244
+ MB ;
236
245
for (i = 0 ; i <= mypos ; i ++ )
237
- job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
246
+ atomic_store_long (& job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ], (BLASLONG )buffer [bufferside ]);
247
+ // job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
238
248
#else
249
+ MB
239
250
for (i = mypos ; i < args -> nthreads ; i ++ )
240
- job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ] = (BLASLONG )buffer [bufferside ];
251
+ atomic_store_long (& job [mypos ].working [i ][CACHE_LINE_SIZE * bufferside ], (BLASLONG )buffer [bufferside ]);
252
+ // job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
241
253
#endif
242
254
243
- WMB ;
255
+ // WMB;
244
256
}
245
257
246
258
min_i = m_to - m_from ;
@@ -271,14 +283,21 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
271
283
for (xxx = range_n [current ], bufferside = 0 ; xxx < range_n [current + 1 ]; xxx += div_n , bufferside ++ ) {
272
284
273
285
/* thread has to wait */
274
- if (current != mypos ) while (job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] == 0 ) {YIELDING ;};
286
+ if (current != mypos )
287
+ do {
288
+ jw = atomic_load_long (& job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ]);
289
+ } while (jw == 0 );
290
+ MB ;
291
+
292
+ //while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
275
293
276
294
KERNEL_OPERATION (min_i , MIN (range_n [current + 1 ] - xxx , div_n ), k , alpha ,
277
295
sa , (FLOAT * )job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ],
278
296
c , lda , m_from , xxx );
279
297
280
298
if (m_from + min_i >= m_to ) {
281
- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
299
+ atomic_store_long (& job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ], job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 );
300
+ // job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
282
301
WMB ;
283
302
}
284
303
}
@@ -323,7 +342,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
323
342
c , lda , is , xxx );
324
343
325
344
if (is + min_i >= m_to ) {
326
- job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 ;
345
+ atomic_store_long (& job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ], job [current ].working [mypos ][CACHE_LINE_SIZE * bufferside ] &= 0 );
346
+ // job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
327
347
WMB ;
328
348
}
329
349
}
@@ -337,9 +357,18 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
337
357
338
358
for (i = 0 ; i < args -> nthreads ; i ++ ) {
339
359
if (i != mypos ) {
340
- for (xxx = 0 ; xxx < DIVIDE_RATE ; xxx ++ ) {
360
+ for (xxx = 0 ; xxx < DIVIDE_RATE ; xxx ++ )
361
+ #if 1
362
+ {
363
+ do {
364
+ jw = atomic_load_long (& job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ]);
365
+ } while (jw );
366
+ MB ;
367
+ }
368
+ #else
341
369
while (job [mypos ].working [i ][CACHE_LINE_SIZE * xxx ] ) {YIELDING ;};
342
- }
370
+ #endif
371
+ // }
343
372
}
344
373
}
345
374
0 commit comments