[PR88370] acc_get_cuda_stream/acc_set_cuda_stream: acc_async_sync, acc_async_noval

Per my reading of the OpenACC specification (and as supported by secondary
documentation, such as code examples, or presentations), it's valid to call
"acc_get_cuda_stream"/"acc_set_cuda_stream" also with "acc_async_sync",
"acc_async_noval" arguments, not just with the nonnegative values as currently
implemented.

	libgomp/
	PR libgomp/88370
	* libgomp.texi (acc_get_current_cuda_context, acc_get_cuda_stream)
	(acc_set_cuda_stream): Clarify.
	* oacc-cuda.c (acc_get_cuda_stream, acc_set_cuda_stream): Use
	"async_valid_p".
	* plugin/plugin-nvptx.c (nvptx_set_cuda_stream): Refuse "async ==
	acc_async_sync".
	* testsuite/libgomp.oacc-c-c++-common/acc_set_cuda_stream-1.c: New file.
	* testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Likewise.
	* testsuite/libgomp.oacc-c-c++-common/lib-84.c: Update.
	* testsuite/libgomp.oacc-c-c++-common/lib-85.c: Likewise.

From-SVN: r267147
This commit is contained in:
Thomas Schwinge 2018-12-14 21:42:08 +01:00 committed by Thomas Schwinge
parent 5d390fd3ae
commit 18c247cc0b
8 changed files with 222 additions and 20 deletions

View File

@ -1,3 +1,17 @@
2018-12-14 Thomas Schwinge <thomas@codesourcery.com>
PR libgomp/88370
* libgomp.texi (acc_get_current_cuda_context, acc_get_cuda_stream)
(acc_set_cuda_stream): Clarify.
* oacc-cuda.c (acc_get_cuda_stream, acc_set_cuda_stream): Use
"async_valid_p".
* plugin/plugin-nvptx.c (nvptx_set_cuda_stream): Refuse "async ==
acc_async_sync".
* testsuite/libgomp.oacc-c-c++-common/acc_set_cuda_stream-1.c: New file.
* testsuite/libgomp.oacc-c-c++-common/async_queue-1.c: Likewise.
* testsuite/libgomp.oacc-c-c++-common/lib-84.c: Update.
* testsuite/libgomp.oacc-c-c++-common/lib-85.c: Likewise.
2018-12-14 Tom de Vries <tdevries@suse.de>
* testsuite/libgomp.c-c++-common/function-not-offloaded-aux.c: New test.

View File

@ -2768,7 +2768,7 @@ as used by the CUDA Runtime or Driver API's.
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{acc_get_current_cuda_context(void);}
@item @emph{Prototype}: @tab @code{void *acc_get_current_cuda_context(void);}
@end multitable
@item @emph{Reference}:
@ -2782,12 +2782,12 @@ A.2.1.2.
@section @code{acc_get_cuda_stream} -- Get CUDA stream handle.
@table @asis
@item @emph{Description}
This function returns the CUDA stream handle. This handle is the same
as used by the CUDA Runtime or Driver API's.
This function returns the CUDA stream handle for the queue @var{async}.
This handle is the same as used by the CUDA Runtime or Driver API's.
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{acc_get_cuda_stream(void);}
@item @emph{Prototype}: @tab @code{void *acc_get_cuda_stream(int async);}
@end multitable
@item @emph{Reference}:
@ -2802,11 +2802,16 @@ A.2.1.3.
@table @asis
@item @emph{Description}
This function associates the stream handle specified by @var{stream} with
the asynchronous value specified by @var{async}.
the queue @var{async}.
This cannot be used to change the stream handle associated with
@code{acc_async_sync}.
The return value is not specified.
@item @emph{C/C++}:
@multitable @columnfractions .20 .80
@item @emph{Prototype}: @tab @code{acc_set_cuda_stream(int async void *stream);}
@item @emph{Prototype}: @tab @code{int acc_set_cuda_stream(int async, void *stream);}
@end multitable
@item @emph{Reference}:

View File

@ -58,7 +58,7 @@ acc_get_cuda_stream (int async)
{
struct goacc_thread *thr = goacc_thread ();
if (!async_valid_stream_id_p (async))
if (!async_valid_p (async))
return NULL;
if (thr && thr->dev && thr->dev->openacc.cuda.get_stream_func)
@ -72,7 +72,7 @@ acc_set_cuda_stream (int async, void *stream)
{
struct goacc_thread *thr;
if (!async_valid_stream_id_p (async) || stream == NULL)
if (!async_valid_p (async) || stream == NULL)
return 0;
goacc_lazy_initialize ();

View File

@ -1753,8 +1753,14 @@ nvptx_set_cuda_stream (int async, void *stream)
pthread_t self = pthread_self ();
struct nvptx_thread *nvthd = nvptx_thread ();
if (async < 0)
GOMP_PLUGIN_fatal ("bad async %d", async);
/* Due to the "null_stream" usage for "acc_async_sync", this cannot be used
to change the stream handle associated with "acc_async_sync". */
if (async == acc_async_sync)
{
GOMP_PLUGIN_debug (0, "Refusing request to set CUDA stream associated"
" with \"acc_async_sync\"\n");
return 0;
}
pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

View File

@ -0,0 +1,42 @@
/* Verify expected nvptx plugin behavior for "acc_set_cuda_stream" for
"acc_async_sync". */
/* { dg-do run { target openacc_nvidia_accel_selected } } */
/* { dg-set-target-env-var GOMP_DEBUG "1" } */
#undef NDEBUG
#include <assert.h>
#include <openacc.h>
int main(void)
{
int async = 42;
/* Initialize. */
#pragma acc parallel async(acc_async_sync)
;
#pragma acc parallel async(async)
;
#pragma acc wait
void *cuda_stream_sync = acc_get_cuda_stream (acc_async_sync);
assert (cuda_stream_sync == NULL);
void *cuda_stream_async = acc_get_cuda_stream (async);
assert (cuda_stream_async != NULL);
int ret = acc_set_cuda_stream (acc_async_sync, cuda_stream_async);
assert (ret == 0);
void *cuda_stream_sync_ = acc_get_cuda_stream (acc_async_sync);
assert (cuda_stream_sync_ == cuda_stream_sync);
void *cuda_stream_async_ = acc_get_cuda_stream (async);
assert (cuda_stream_async_ == cuda_stream_async);
#pragma acc parallel async(acc_async_sync)
;
#pragma acc parallel async(async)
;
#pragma acc wait
return 0;
}
/* { dg-output "Refusing request to set CUDA stream associated with \"acc_async_sync\"" } */

View File

@ -0,0 +1,97 @@
/* Test mapping of async values to specific underlying queues. */
#undef NDEBUG
#include <assert.h>
#include <openacc.h>
/* This is implemented in terms of the "acc_get_cuda_stream" interface. */
struct
{
int async;
void *cuda_stream;
} queues[] = { { acc_async_sync, NULL },
{ acc_async_noval, NULL },
{ 0, NULL },
{ 1, NULL },
{ 2, NULL },
{ 36, NULL },
{ 1982, NULL } };
const size_t queues_n = sizeof queues / sizeof queues[0];
int main(void)
{
/* Explicitly initialize: it's not clear whether the following OpenACC
runtime library calls implicitly initialize;
<https://github.com/OpenACC/openacc-spec/issues/102>. */
acc_device_t d;
#if defined ACC_DEVICE_TYPE_nvidia
d = acc_device_nvidia;
#elif defined ACC_DEVICE_TYPE_host
d = acc_device_host;
#else
# error Not ported to this ACC_DEVICE_TYPE
#endif
acc_init (d);
for (size_t i = 0; i < queues_n; ++i)
{
/* Before actually being used, there are all NULL. */
queues[i].cuda_stream = acc_get_cuda_stream (queues[i].async);
assert (queues[i].cuda_stream == NULL);
}
for (size_t i = 0; i < queues_n; ++i)
{
/* Use the queue to initialize it. */
#pragma acc parallel async(queues[i].async)
;
#pragma acc wait
/* Verify CUDA stream used. */
queues[i].cuda_stream = acc_get_cuda_stream (queues[i].async);
#if defined ACC_DEVICE_TYPE_nvidia
/* "acc_async_sync" maps to the NULL CUDA default stream. */
if (queues[i].async == acc_async_sync)
assert (queues[i].cuda_stream == NULL);
else
assert (queues[i].cuda_stream != NULL);
#elif defined ACC_DEVICE_TYPE_host
/* For "acc_device_host" there are no CUDA streams. */
assert (queues[i].cuda_stream == NULL);
#else
# error Not ported to this ACC_DEVICE_TYPE
#endif
}
/* Verify same results. */
for (size_t i = 0; i < queues_n; ++i)
{
void *cuda_stream;
cuda_stream = acc_get_cuda_stream (queues[i].async);
assert (cuda_stream == queues[i].cuda_stream);
#pragma acc parallel async(queues[i].async)
;
#pragma acc wait
cuda_stream = acc_get_cuda_stream (queues[i].async);
assert (cuda_stream == queues[i].cuda_stream);
}
/* Verify individual underlying queues are all different. */
for (size_t i = 0; i < queues_n; ++i)
{
if (queues[i].cuda_stream == NULL)
continue;
for (size_t j = i + 1; j < queues_n; ++j)
{
if (queues[j].cuda_stream == NULL)
continue;
assert (queues[j].cuda_stream != queues[i].cuda_stream);
}
}
return 0;
}

View File

@ -7,6 +7,14 @@
#include <openacc.h>
#include <cuda.h>
#if !defined __cplusplus
# undef static_assert
# define static_assert _Static_assert
#endif
static_assert (acc_async_sync == -2, "acc_async_sync?");
static_assert (acc_async_noval == -1, "acc_async_noval?");
int
main (int argc, char **argv)
{
@ -20,9 +28,11 @@ main (int argc, char **argv)
(void) acc_get_device_num (acc_device_nvidia);
streams = (CUstream *) malloc (N * sizeof (void *));
streams = (CUstream *) malloc ((2 + N) * sizeof (void *));
streams += 2;
/* "streams[i]" is valid for i in [acc_async_sync..N). */
for (i = 0; i < N; i++)
for (i = acc_async_sync; i < N; i++)
{
streams[i] = (CUstream) acc_get_cuda_stream (i);
if (streams[i] != NULL)
@ -35,11 +45,20 @@ main (int argc, char **argv)
abort ();
}
if (!acc_set_cuda_stream (i, streams[i]))
abort ();
int ret = acc_set_cuda_stream (i, streams[i]);
if (i == acc_async_sync)
{
if (ret == 1)
abort ();
}
else
{
if (ret != 1)
abort ();
}
}
for (i = 0; i < N; i++)
for (i = acc_async_sync; i < N; i++)
{
int j;
int cnt;
@ -48,7 +67,7 @@ main (int argc, char **argv)
s = streams[i];
for (j = 0; j < N; j++)
for (j = acc_async_sync; j < N; j++)
{
if (s == streams[j])
cnt++;

View File

@ -7,6 +7,14 @@
#include <stdio.h>
#include <cuda.h>
#if !defined __cplusplus
# undef static_assert
# define static_assert _Static_assert
#endif
static_assert (acc_async_sync == -2, "acc_async_sync?");
static_assert (acc_async_noval == -1, "acc_async_noval?");
int
main (int argc, char **argv)
{
@ -20,9 +28,11 @@ main (int argc, char **argv)
(void) acc_get_device_num (acc_device_nvidia);
streams = (CUstream *) malloc (N * sizeof (void *));
streams = (CUstream *) malloc ((2 + N) * sizeof (void *));
streams += 2;
/* "streams[i]" is valid for i in [acc_async_sync..N). */
for (i = 0; i < N; i++)
for (i = acc_async_sync; i < N; i++)
{
streams[i] = (CUstream) acc_get_cuda_stream (i);
if (streams[i] != NULL)
@ -35,8 +45,17 @@ main (int argc, char **argv)
abort ();
}
if (!acc_set_cuda_stream (i, streams[i]))
abort ();
int ret = acc_set_cuda_stream (i, streams[i]);
if (i == acc_async_sync)
{
if (ret == 1)
abort ();
}
else
{
if (ret != 1)
abort ();
}
}
s = NULL;