diff --git a/libgomp/config/gcn/bar.c b/libgomp/config/gcn/bar.c
index 02fd19710d43..a21529a624b6 100644
--- a/libgomp/config/gcn/bar.c
+++ b/libgomp/config/gcn/bar.c
@@ -43,7 +43,8 @@ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
       __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
 			MEMMODEL_RELAXED);
     }
-  asm ("s_barrier" ::: "memory");
+  if (bar->total > 1)
+    asm ("s_barrier" ::: "memory");
 }
 
 void
@@ -71,7 +72,8 @@ gomp_barrier_wait_last (gomp_barrier_t *bar)
 void
 gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
 {
-  asm ("s_barrier" ::: "memory");
+  if (bar->total > 1)
+    asm ("s_barrier" ::: "memory");
 }
 
 void
@@ -97,7 +99,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 	  state &= ~BAR_CANCELLED;
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
-	  asm ("s_barrier" ::: "memory");
+	  if (bar->total > 1)
+	    asm ("s_barrier" ::: "memory");
 	  return;
 	}
     }
@@ -172,7 +175,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
 	{
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED);
-	  asm ("s_barrier" ::: "memory");
+	  if (bar->total > 1)
+	    asm ("s_barrier" ::: "memory");
 	  return false;
 	}
     }
@@ -195,7 +199,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
 	  abort();
 	}
 
-      asm ("s_barrier" ::: "memory");
+      if (bar->total > 1)
+	asm ("s_barrier" ::: "memory");
       gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED);
       if (__builtin_expect (gen & BAR_CANCELLED, 0))
 	return true;
diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c
index 125ca3e49ecf..1116561d9315 100644
--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -41,7 +41,8 @@ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
       __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
 			MEMMODEL_RELEASE);
     }
-  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+  if (bar->total > 1)
+    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 }
 
 void
@@ -69,7 +70,8 @@ gomp_barrier_wait_last (gomp_barrier_t *bar)
 void
 gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
 {
-  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+  if (bar->total > 1)
+    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 }
 
 void
@@ -95,7 +97,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
 	  state &= ~BAR_CANCELLED;
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-	  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+	  if (bar->total > 1)
+	    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 	  return;
 	}
     }
@@ -104,7 +107,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
   state &= ~BAR_CANCELLED;
   do
     {
-      asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (bar->total > 1)
+	asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_TASK_PENDING, 0))
 	{
@@ -158,7 +162,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
 	{
 	  state += BAR_INCR - BAR_WAS_LAST;
 	  __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE);
-	  asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+	  if (bar->total > 1)
+	    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
 	  return false;
 	}
     }
@@ -169,7 +174,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
   generation = state;
   do
     {
-      asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+      if (bar->total > 1)
+	asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
       gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE);
       if (__builtin_expect (gen & BAR_CANCELLED, 0))
 	return true;
diff --git a/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c
new file mode 100644
index 000000000000..e777271dde1d
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c
@@ -0,0 +1,35 @@
+/* Ensure that nested parallel regions work even when the number of loop
+   iterations is not divisible by the number of threads.  */
+
+#include <stdlib.h>	/* size_t, abort */
+
+int main() {
+  int A[30][40], B[30][40];
+  /* 30 iterations shared among 8 threads: 30 is not a multiple of 8.  */
+  size_t n = 30;
+
+  for (size_t i = 0; i < 30; ++i)
+    for (size_t j = 0; j < 40; ++j)
+      A[i][j] = 42;
+
+#pragma omp target map(A[0:30][0:40], B[0:30][0:40])
+  {
+#pragma omp parallel for num_threads(8)
+    for (size_t i = 0; i < n; ++i)
+      {
+	/* Nested parallel region, entered once per outer iteration.  */
+#pragma omp parallel for
+	for (size_t j = 0; j < n; ++j)
+	  {
+	    B[i][j] = A[i][j];
+	  }
+      }
+  }
+
+  /* Only the first N columns of each row were copied, so only those are
+     checked.  */
+  for (size_t i = 0; i < n; ++i)
+    for (size_t j = 0; j < n; ++j)
+      if (B[i][j] != 42)
+	abort ();
+}