/*
 *     Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
 *
 * NVIDIA CORPORATION and its licensors retain all intellectual property
 * and proprietary rights in and to this software, related documentation
 * and any modifications thereto.  Any use, reproduction, disclosure or
 * distribution of this software and related documentation without an express
 * license agreement from NVIDIA CORPORATION is strictly prohibited.
 *
 *         THIS CODE AND INFORMATION ARE PROVIDED "AS IS" WITHOUT
 *  WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT
 *  NOT LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR
 *  FITNESS FOR A PARTICULAR PURPOSE.
 */

#include <stdint.h>
#include "cuda_runtime.h"

#include "nvshmem.h"
#include "nvshmemx.h"

/*
 * Declaration prefix used by the wrapper definitions below.  Unless
 * NVHPC_NVSHMEM_DEV_SIG has already been defined before this point, it
 * expands to `__device__ static __inline__`.
 */
#ifndef NVHPC_NVSHMEM_DEV_SIG
#define NVHPC_NVSHMEM_DEV_SIG __device__ static __inline__
#endif

/* ------------------------------------------------------------------------- */
/*
 * Pass-through to nvshmem_putmem: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_putmem(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_putmem(dest, src, nelems, pe);
}

/* int8 */
/*
 * Pass-through to nvshmem_int8_put: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int8_put(int8_t *dest, const int8_t *src, size_t nelems, int pe)
{
  nvshmem_int8_put(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_int8_p: dest, src (taken by value) and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int8_p(int8_t *dest, int8_t src, int pe)
{
  nvshmem_int8_p(dest, src, pe);
}

/* int16 */
/*
 * Pass-through to nvshmem_int16_put: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int16_put(int16_t *dest, const int16_t *src, size_t nelems, int pe)
{
  nvshmem_int16_put(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_int16_p: dest, src (taken by value) and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int16_p(int16_t *dest, int16_t src, int pe)
{
  nvshmem_int16_p(dest, src, pe);
}

/* int32 */
/*
 * Pass-through to nvshmem_int32_put: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int32_put(int32_t *dest, const int32_t *src, size_t nelems, int pe)
{
  nvshmem_int32_put(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_int32_p: dest, src (taken by value) and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int32_p(int32_t *dest, int32_t src, int pe)
{
  nvshmem_int32_p(dest, src, pe);
}

/* int64 */
/*
 * Pass-through to nvshmem_int64_put: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int64_put(int64_t *dest, const int64_t *src, size_t nelems, int pe)
{
  nvshmem_int64_put(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_int64_p: dest, src (taken by value) and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int64_p(int64_t *dest, int64_t src, int pe)
{
  nvshmem_int64_p(dest, src, pe);
}

/* float, real(4) */
/*
 * Pass-through to nvshmem_float_put: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_float_put(float *dest, const float *src, size_t nelems, int pe)
{
  nvshmem_float_put(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_float_p: dest, src (taken by value) and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_float_p(float *dest, float src, int pe)
{
  nvshmem_float_p(dest, src, pe);
}

/* double, real(8) */
/*
 * Pass-through to nvshmem_double_put: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_double_put(double *dest, const double *src, size_t nelems, int pe)
{
  nvshmem_double_put(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_double_p: dest, src (taken by value) and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_double_p(double *dest, double src, int pe)
{
  nvshmem_double_p(dest, src, pe);
}

/* put8 */
/*
 * Pass-through to nvshmem_put8: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_put8(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_put8(dest, src, nelems, pe);
}

/* put16 */
/*
 * Pass-through to nvshmem_put16: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_put16(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_put16(dest, src, nelems, pe);
}

/* put32 */
/*
 * Pass-through to nvshmem_put32: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_put32(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_put32(dest, src, nelems, pe);
}

/* put64, complex */
/*
 * Pass-through to nvshmem_put64: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_put64(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_put64(dest, src, nelems, pe);
}

/* put128, dcomplex */
/*
 * Pass-through to nvshmem_put128: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_put128(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_put128(dest, src, nelems, pe);
}

/* ------------------------------------------------------------------------- */
/* put_block */

/*
 * Pass-through to nvshmemx_putmem_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_putmem_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_putmem_block(dest, src, nelems, pe);
}

/* int8 */
/*
 * Pass-through to nvshmemx_int8_put_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int8_put_block(int8_t *dest, const int8_t *src, size_t nelems, int pe)
{
  nvshmemx_int8_put_block(dest, src, nelems, pe);
}

/* int16 */
/*
 * Pass-through to nvshmemx_int16_put_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int16_put_block(int16_t *dest, const int16_t *src, size_t nelems, int pe)
{
  nvshmemx_int16_put_block(dest, src, nelems, pe);
}

/* int32 */
/*
 * Pass-through to nvshmemx_int32_put_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int32_put_block(int32_t *dest, const int32_t *src, size_t nelems, int pe)
{
  nvshmemx_int32_put_block(dest, src, nelems, pe);
}

/* int64 */
/*
 * Pass-through to nvshmemx_int64_put_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int64_put_block(int64_t *dest, const int64_t *src, size_t nelems, int pe)
{
  nvshmemx_int64_put_block(dest, src, nelems, pe);
}

/* float, real(4) */
/*
 * Pass-through to nvshmemx_float_put_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_float_put_block(float *dest, const float *src, size_t nelems, int pe)
{
  nvshmemx_float_put_block(dest, src, nelems, pe);
}

/* double, real(8) */
/*
 * Pass-through to nvshmemx_double_put_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_double_put_block(double *dest, const double *src, size_t nelems, int pe)
{
  nvshmemx_double_put_block(dest, src, nelems, pe);
}

/* put8 */
/*
 * Pass-through to nvshmemx_put8_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put8_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put8_block(dest, src, nelems, pe);
}

/* put16 */
/*
 * Pass-through to nvshmemx_put16_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put16_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put16_block(dest, src, nelems, pe);
}

/* put32 */
/*
 * Pass-through to nvshmemx_put32_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put32_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put32_block(dest, src, nelems, pe);
}

/* put64, complex */
/*
 * Pass-through to nvshmemx_put64_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put64_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put64_block(dest, src, nelems, pe);
}

/* put128, dcomplex */
/*
 * Pass-through to nvshmemx_put128_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put128_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put128_block(dest, src, nelems, pe);
}

/* ------------------------------------------------------------------------- */
/* put_warp */

/*
 * Pass-through to nvshmemx_putmem_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_putmem_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_putmem_warp(dest, src, nelems, pe);
}

/* int8 */
/*
 * Pass-through to nvshmemx_int8_put_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int8_put_warp(int8_t *dest, const int8_t *src, size_t nelems, int pe)
{
  nvshmemx_int8_put_warp(dest, src, nelems, pe);
}

/* int16 */
/*
 * Pass-through to nvshmemx_int16_put_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int16_put_warp(int16_t *dest, const int16_t *src, size_t nelems, int pe)
{
  nvshmemx_int16_put_warp(dest, src, nelems, pe);
}

/* int32 */
/*
 * Pass-through to nvshmemx_int32_put_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int32_put_warp(int32_t *dest, const int32_t *src, size_t nelems, int pe)
{
  nvshmemx_int32_put_warp(dest, src, nelems, pe);
}

/* int64 */
/*
 * Pass-through to nvshmemx_int64_put_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int64_put_warp(int64_t *dest, const int64_t *src, size_t nelems, int pe)
{
  nvshmemx_int64_put_warp(dest, src, nelems, pe);
}

/* float, real(4) */
/*
 * Pass-through to nvshmemx_float_put_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_float_put_warp(float *dest, const float *src, size_t nelems, int pe)
{
  nvshmemx_float_put_warp(dest, src, nelems, pe);
}

/* double, real(8) */
/*
 * Pass-through to nvshmemx_double_put_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_double_put_warp(double *dest, const double *src, size_t nelems, int pe)
{
  nvshmemx_double_put_warp(dest, src, nelems, pe);
}

/* put8 */
/*
 * Pass-through to nvshmemx_put8_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put8_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put8_warp(dest, src, nelems, pe);
}

/* put16 */
/*
 * Pass-through to nvshmemx_put16_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put16_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put16_warp(dest, src, nelems, pe);
}

/* put32 */
/*
 * Pass-through to nvshmemx_put32_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put32_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put32_warp(dest, src, nelems, pe);
}

/* put64, complex */
/*
 * Pass-through to nvshmemx_put64_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put64_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put64_warp(dest, src, nelems, pe);
}

/* put128, dcomplex */
/*
 * Pass-through to nvshmemx_put128_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_put128_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_put128_warp(dest, src, nelems, pe);
}

/* ------------------------------------------------------------------------- */

/*
 * Pass-through to nvshmem_getmem: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_getmem(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_getmem(dest, src, nelems, pe);
}

/* int8 */
/*
 * Pass-through to nvshmem_int8_get: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int8_get(int8_t *dest, const int8_t *src, size_t nelems, int pe)
{
  nvshmem_int8_get(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_int8_g: src and pe are handed over unchanged and
 * the callee's int8_t result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int8_t
__nvhpc_cudalib_nvshmem_int8_g(const int8_t *src, int pe)
{
  const int8_t fetched = nvshmem_int8_g(src, pe);
  return fetched;
}

/* int16 */
/*
 * Pass-through to nvshmem_int16_get: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int16_get(int16_t *dest, const int16_t *src, size_t nelems, int pe)
{
  nvshmem_int16_get(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_int16_g: src and pe are handed over unchanged and
 * the callee's int16_t result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int16_t
__nvhpc_cudalib_nvshmem_int16_g(const int16_t *src, int pe)
{
  const int16_t fetched = nvshmem_int16_g(src, pe);
  return fetched;
}

/* int32 */
/*
 * Pass-through to nvshmem_int32_get: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int32_get(int32_t *dest, const int32_t *src, size_t nelems, int pe)
{
  nvshmem_int32_get(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_int32_g: src and pe are handed over unchanged and
 * the callee's int32_t result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int32_t
__nvhpc_cudalib_nvshmem_int32_g(const int32_t *src, int pe)
{
  const int32_t fetched = nvshmem_int32_g(src, pe);
  return fetched;
}

/* int64 */
/*
 * Pass-through to nvshmem_int64_get: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_int64_get(int64_t *dest, const int64_t *src, size_t nelems, int pe)
{
  nvshmem_int64_get(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_int64_g: src and pe are handed over unchanged and
 * the callee's int64_t result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int64_t
__nvhpc_cudalib_nvshmem_int64_g(const int64_t *src, int pe)
{
  const int64_t fetched = nvshmem_int64_g(src, pe);
  return fetched;
}

/* float, real(4) */
/*
 * Pass-through to nvshmem_float_get: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_float_get(float *dest, const float *src, size_t nelems, int pe)
{
  nvshmem_float_get(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_float_g: src and pe are handed over unchanged and
 * the callee's float result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG float
__nvhpc_cudalib_nvshmem_float_g(const float *src, int pe)
{
  const float fetched = nvshmem_float_g(src, pe);
  return fetched;
}

/* double, real(8) */
/*
 * Pass-through to nvshmem_double_get: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_double_get(double *dest, const double *src, size_t nelems, int pe)
{
  nvshmem_double_get(dest, src, nelems, pe);
}

/*
 * Pass-through to nvshmem_double_g: src and pe are handed over unchanged and
 * the callee's double result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG double
__nvhpc_cudalib_nvshmem_double_g(const double *src, int pe)
{
  const double fetched = nvshmem_double_g(src, pe);
  return fetched;
}

/* get8 */
/*
 * Pass-through to nvshmem_get8: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_get8(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_get8(dest, src, nelems, pe);
}

/* get16 */
/*
 * Pass-through to nvshmem_get16: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_get16(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_get16(dest, src, nelems, pe);
}

/* get32 */
/*
 * Pass-through to nvshmem_get32: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_get32(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_get32(dest, src, nelems, pe);
}

/* get64, complex */
/*
 * Pass-through to nvshmem_get64: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_get64(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_get64(dest, src, nelems, pe);
}

/* get128, dcomplex */
/*
 * Pass-through to nvshmem_get128: dest, src, nelems and pe are handed over
 * unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_get128(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmem_get128(dest, src, nelems, pe);
}

/* ------------------------------------------------------------------------- */
/* get_block */

/*
 * Pass-through to nvshmemx_getmem_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_getmem_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_getmem_block(dest, src, nelems, pe);
}

/* int8 */
/*
 * Pass-through to nvshmemx_int8_get_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int8_get_block(int8_t *dest, const int8_t *src, size_t nelems, int pe)
{
  nvshmemx_int8_get_block(dest, src, nelems, pe);
}

/* int16 */
/*
 * Pass-through to nvshmemx_int16_get_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int16_get_block(int16_t *dest, const int16_t *src, size_t nelems, int pe)
{
  nvshmemx_int16_get_block(dest, src, nelems, pe);
}

/* int32 */
/*
 * Pass-through to nvshmemx_int32_get_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int32_get_block(int32_t *dest, const int32_t *src, size_t nelems, int pe)
{
  nvshmemx_int32_get_block(dest, src, nelems, pe);
}

/* int64 */
/*
 * Pass-through to nvshmemx_int64_get_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int64_get_block(int64_t *dest, const int64_t *src, size_t nelems, int pe)
{
  nvshmemx_int64_get_block(dest, src, nelems, pe);
}

/* float, real(4) */
/*
 * Pass-through to nvshmemx_float_get_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_float_get_block(float *dest, const float *src, size_t nelems, int pe)
{
  nvshmemx_float_get_block(dest, src, nelems, pe);
}

/* double, real(8) */
/*
 * Pass-through to nvshmemx_double_get_block: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_double_get_block(double *dest, const double *src, size_t nelems, int pe)
{
  nvshmemx_double_get_block(dest, src, nelems, pe);
}

/* get8 */
/*
 * Pass-through to nvshmemx_get8_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get8_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get8_block(dest, src, nelems, pe);
}

/* get16 */
/*
 * Pass-through to nvshmemx_get16_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get16_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get16_block(dest, src, nelems, pe);
}

/* get32 */
/*
 * Pass-through to nvshmemx_get32_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get32_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get32_block(dest, src, nelems, pe);
}

/* get64, complex */
/*
 * Pass-through to nvshmemx_get64_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get64_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get64_block(dest, src, nelems, pe);
}

/* get128, dcomplex */
/*
 * Pass-through to nvshmemx_get128_block: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get128_block(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get128_block(dest, src, nelems, pe);
}

/* ------------------------------------------------------------------------- */
/* get_warp */

/*
 * Pass-through to nvshmemx_getmem_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_getmem_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_getmem_warp(dest, src, nelems, pe);
}

/* int8 */
/*
 * Pass-through to nvshmemx_int8_get_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int8_get_warp(int8_t *dest, const int8_t *src, size_t nelems, int pe)
{
  nvshmemx_int8_get_warp(dest, src, nelems, pe);
}

/* int16 */
/*
 * Pass-through to nvshmemx_int16_get_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int16_get_warp(int16_t *dest, const int16_t *src, size_t nelems, int pe)
{
  nvshmemx_int16_get_warp(dest, src, nelems, pe);
}

/* int32 */
/*
 * Pass-through to nvshmemx_int32_get_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int32_get_warp(int32_t *dest, const int32_t *src, size_t nelems, int pe)
{
  nvshmemx_int32_get_warp(dest, src, nelems, pe);
}

/* int64 */
/*
 * Pass-through to nvshmemx_int64_get_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_int64_get_warp(int64_t *dest, const int64_t *src, size_t nelems, int pe)
{
  nvshmemx_int64_get_warp(dest, src, nelems, pe);
}

/* float, real(4) */
/*
 * Pass-through to nvshmemx_float_get_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_float_get_warp(float *dest, const float *src, size_t nelems, int pe)
{
  nvshmemx_float_get_warp(dest, src, nelems, pe);
}

/* double, real(8) */
/*
 * Pass-through to nvshmemx_double_get_warp: dest, src, nelems and pe are
 * handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_double_get_warp(double *dest, const double *src, size_t nelems, int pe)
{
  nvshmemx_double_get_warp(dest, src, nelems, pe);
}

/* get8 */
/*
 * Pass-through to nvshmemx_get8_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get8_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get8_warp(dest, src, nelems, pe);
}

/* get16 */
/*
 * Pass-through to nvshmemx_get16_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get16_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get16_warp(dest, src, nelems, pe);
}

/* get32 */
/*
 * Pass-through to nvshmemx_get32_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get32_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get32_warp(dest, src, nelems, pe);
}

/* get64, complex */
/*
 * Pass-through to nvshmemx_get64_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get64_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get64_warp(dest, src, nelems, pe);
}

/* get128, dcomplex */
/*
 * Pass-through to nvshmemx_get128_warp: dest, src, nelems and pe are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmemx_get128_warp(void *dest, const void *src, size_t nelems, int pe)
{
  nvshmemx_get128_warp(dest, src, nelems, pe);
}

/* ------------------------------------------------------------------------- */
/* point-to-point synchronization */

/*
 * Pass-through to nvshmem_int32_wait_until: ivar, cmp and value are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_nvshmem_int32_wait_until(int32_t *ivar, int cmp, int32_t value)
{
  nvshmem_int32_wait_until(ivar, cmp, value);
}

/*
 * Pass-through to nvshmem_int64_wait_until: ivar, cmp and value are handed
 * over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_nvshmem_int64_wait_until(int64_t *ivar, int cmp, int64_t value)
{
  nvshmem_int64_wait_until(ivar, cmp, value);
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_wait_until_all: ivar, nelems, status, cmp
 * and value are handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_nvshmem_int32_wait_until_all(int32_t *ivar, size_t nelems,
                                     const int *status, int cmp, int32_t value)
{
  nvshmem_int32_wait_until_all(ivar, nelems, status, cmp, value);
}

/*
 * Pass-through to nvshmem_int64_wait_until_all: ivar, nelems, status, cmp
 * and value are handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_nvshmem_int64_wait_until_all(int64_t *ivar, size_t nelems,
                                     const int *status, int cmp, int64_t value)
{
  nvshmem_int64_wait_until_all(ivar, nelems, status, cmp, value);
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_wait_until_any: ivar, nelems, status, cmp
 * and value are handed over unchanged and the callee's size_t result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int32_wait_until_any(int32_t *ivar, size_t nelems,
                                     const int *status, int cmp, int32_t value)
{
  const size_t result = nvshmem_int32_wait_until_any(ivar, nelems, status, cmp, value);
  return result;
}

/*
 * Pass-through to nvshmem_int64_wait_until_any: ivar, nelems, status, cmp
 * and value are handed over unchanged and the callee's size_t result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int64_wait_until_any(int64_t *ivar, size_t nelems,
                                     const int *status, int cmp, int64_t value)
{
  const size_t result = nvshmem_int64_wait_until_any(ivar, nelems, status, cmp, value);
  return result;
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_wait_until_some: ivar, nelems, indices,
 * status, cmp and value are handed over unchanged and the callee's size_t
 * result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int32_wait_until_some(int32_t *ivar, size_t nelems,
                                      size_t *indices, const int *status,
                                      int cmp, int32_t value)
{
  const size_t result =
      nvshmem_int32_wait_until_some(ivar, nelems, indices, status, cmp, value);
  return result;
}

/*
 * Pass-through to nvshmem_int64_wait_until_some: ivar, nelems, indices,
 * status, cmp and value are handed over unchanged and the callee's size_t
 * result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int64_wait_until_some(int64_t *ivar, size_t nelems,
                                      size_t *indices, const int *status,
                                      int cmp, int64_t value)
{
  const size_t result =
      nvshmem_int64_wait_until_some(ivar, nelems, indices, status, cmp, value);
  return result;
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_wait_until_all_vector: ivar, nelems, status,
 * cmp and values are handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_nvshmem_int32_wait_until_all_vector(int32_t *ivar, size_t nelems,
                                            const int *status, int cmp,
                                            int32_t *values)
{
  nvshmem_int32_wait_until_all_vector(ivar, nelems, status, cmp, values);
}

/*
 * Pass-through to nvshmem_int64_wait_until_all_vector: ivar, nelems, status,
 * cmp and values are handed over unchanged and nothing is returned.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_nvshmem_int64_wait_until_all_vector(int64_t *ivar, size_t nelems,
                                            const int *status, int cmp,
                                            int64_t *values)
{
  nvshmem_int64_wait_until_all_vector(ivar, nelems, status, cmp, values);
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_wait_until_any_vector: ivar, nelems, status,
 * cmp and values are handed over unchanged and the callee's size_t result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int32_wait_until_any_vector(int32_t *ivar, size_t nelems,
                                            const int *status, int cmp,
                                            int32_t *values)
{
  const size_t result =
      nvshmem_int32_wait_until_any_vector(ivar, nelems, status, cmp, values);
  return result;
}

/*
 * Pass-through to nvshmem_int64_wait_until_any_vector: ivar, nelems, status,
 * cmp and values are handed over unchanged and the callee's size_t result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int64_wait_until_any_vector(int64_t *ivar, size_t nelems,
                                            const int *status, int cmp,
                                            int64_t *values)
{
  const size_t result =
      nvshmem_int64_wait_until_any_vector(ivar, nelems, status, cmp, values);
  return result;
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_wait_until_some_vector: ivar, nelems,
 * indices, status, cmp and values are handed over unchanged and the callee's
 * size_t result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int32_wait_until_some_vector(int32_t *ivar, size_t nelems,
                                             size_t *indices, const int *status,
                                             int cmp, int32_t *values)
{
  const size_t result =
      nvshmem_int32_wait_until_some_vector(ivar, nelems, indices, status, cmp, values);
  return result;
}

/*
 * Pass-through to nvshmem_int64_wait_until_some_vector: ivar, nelems,
 * indices, status, cmp and values are handed over unchanged and the callee's
 * size_t result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int64_wait_until_some_vector(int64_t *ivar, size_t nelems,
                                             size_t *indices, const int *status,
                                             int cmp, int64_t *values)
{
  const size_t result =
      nvshmem_int64_wait_until_some_vector(ivar, nelems, indices, status, cmp, values);
  return result;
}

/* ------------------------------------------- */

/*
 * Pass-through to nvshmem_int32_test: ivar, cmp and value are handed over
 * unchanged and the callee's int result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_nvshmem_int32_test(int32_t *ivar, int cmp, int32_t value)
{
  const int result = nvshmem_int32_test(ivar, cmp, value);
  return result;
}

/*
 * Pass-through to nvshmem_int64_test: ivar, cmp and value are handed over
 * unchanged and the callee's int result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_nvshmem_int64_test(int64_t *ivar, int cmp, int64_t value)
{
  const int result = nvshmem_int64_test(ivar, cmp, value);
  return result;
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_test_all: ivar, nelems, status, cmp and
 * value are handed over unchanged and the callee's int result is returned to
 * the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_nvshmem_int32_test_all(int32_t *ivar, size_t nelems,
                               const int *status, int cmp, int32_t value)
{
  const int result = nvshmem_int32_test_all(ivar, nelems, status, cmp, value);
  return result;
}

/*
 * Pass-through to nvshmem_int64_test_all: ivar, nelems, status, cmp and
 * value are handed over unchanged and the callee's int result is returned to
 * the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_nvshmem_int64_test_all(int64_t *ivar, size_t nelems,
                               const int *status, int cmp, int64_t value)
{
  const int result = nvshmem_int64_test_all(ivar, nelems, status, cmp, value);
  return result;
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_test_any: ivar, nelems, status, cmp and
 * value are handed over unchanged and the callee's size_t result is returned
 * to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int32_test_any(int32_t *ivar, size_t nelems,
                               const int *status, int cmp, int32_t value)
{
  const size_t result = nvshmem_int32_test_any(ivar, nelems, status, cmp, value);
  return result;
}

/*
 * Pass-through to nvshmem_int64_test_any: ivar, nelems, status, cmp and
 * value are handed over unchanged and the callee's size_t result is returned
 * to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int64_test_any(int64_t *ivar, size_t nelems,
                               const int *status, int cmp, int64_t value)
{
  const size_t result = nvshmem_int64_test_any(ivar, nelems, status, cmp, value);
  return result;
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_test_some: ivar, nelems, indices, status,
 * cmp and value are handed over unchanged and the callee's size_t result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int32_test_some(int32_t *ivar, size_t nelems, size_t *indices,
                                const int *status, int cmp, int32_t value)
{
  const size_t result =
      nvshmem_int32_test_some(ivar, nelems, indices, status, cmp, value);
  return result;
}

/*
 * Pass-through to nvshmem_int64_test_some: ivar, nelems, indices, status,
 * cmp and value are handed over unchanged and the callee's size_t result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int64_test_some(int64_t *ivar, size_t nelems, size_t *indices,
                                const int *status, int cmp, int64_t value)
{
  const size_t result =
      nvshmem_int64_test_some(ivar, nelems, indices, status, cmp, value);
  return result;
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_test_all_vector: ivar, nelems, status, cmp
 * and values are handed over unchanged and the callee's int result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_nvshmem_int32_test_all_vector(int32_t *ivar, size_t nelems,
                                      const int *status, int cmp,
                                      int32_t *values)
{
  const int result =
      nvshmem_int32_test_all_vector(ivar, nelems, status, cmp, values);
  return result;
}

/*
 * Pass-through to nvshmem_int64_test_all_vector: ivar, nelems, status, cmp
 * and values are handed over unchanged and the callee's int result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_nvshmem_int64_test_all_vector(int64_t *ivar, size_t nelems,
                                      const int *status, int cmp,
                                      int64_t *values)
{
  const int result =
      nvshmem_int64_test_all_vector(ivar, nelems, status, cmp, values);
  return result;
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_test_any_vector: ivar, nelems, status, cmp
 * and values are handed over unchanged and the callee's size_t result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int32_test_any_vector(int32_t *ivar, size_t nelems,
                                      const int *status, int cmp,
                                      int32_t *values)
{
  const size_t result =
      nvshmem_int32_test_any_vector(ivar, nelems, status, cmp, values);
  return result;
}

/*
 * Pass-through to nvshmem_int64_test_any_vector: ivar, nelems, status, cmp
 * and values are handed over unchanged and the callee's size_t result is
 * returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int64_test_any_vector(int64_t *ivar, size_t nelems,
                                      const int *status, int cmp,
                                      int64_t *values)
{
  const size_t result =
      nvshmem_int64_test_any_vector(ivar, nelems, status, cmp, values);
  return result;
}

/* ----------------------- */
/*
 * Pass-through to nvshmem_int32_test_some_vector: ivar, nelems, indices,
 * status, cmp and values are handed over unchanged and the callee's size_t
 * result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int32_test_some_vector(int32_t *ivar, size_t nelems,
                                       size_t *indices, const int *status,
                                       int cmp, int32_t *values)
{
  const size_t result =
      nvshmem_int32_test_some_vector(ivar, nelems, indices, status, cmp, values);
  return result;
}

/*
 * Pass-through to nvshmem_int64_test_some_vector: ivar, nelems, indices,
 * status, cmp and values are handed over unchanged and the callee's size_t
 * result is returned to the caller.
 */
NVHPC_NVSHMEM_DEV_SIG size_t
__nvhpc_nvshmem_int64_test_some_vector(int64_t *ivar, size_t nelems,
                                       size_t *indices, const int *status,
                                       int cmp, int64_t *values)
{
  const size_t result =
      nvshmem_int64_test_some_vector(ivar, nelems, indices, status, cmp, values);
  return result;
}

/* ------------------------------------------------------------------------- */
/* memory ordering */

/*
 * Pass-through to nvshmem_quiet: takes no arguments, calls nvshmem_quiet()
 * and returns nothing.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_quiet()
{
  nvshmem_quiet();
}

/*
 * Pass-through to nvshmem_fence: takes no arguments, calls nvshmem_fence()
 * and returns nothing.
 */
NVHPC_NVSHMEM_DEV_SIG void
__nvhpc_cudalib_nvshmem_fence()
{
  nvshmem_fence();
}


/* Just a test: the reduction wrappers below are commented out and are not compiled. */
/*
NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_cudalib_nvshmemx_float_sum_reduce_block(int team, float *dest, const float *src, size_t nreduce)
{
  return nvshmemx_float_sum_reduce_block(team, dest, src, (int) nreduce);
}

NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_cudalib_nvshmemx_double_sum_reduce_block(int team, double *dest, const double *src, size_t nreduce)
{
  return nvshmemx_double_sum_reduce_block(team, dest, src, (int) nreduce);
}

NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_cudalib_nvshmemx_float_sum_reduce_warp(int team, float *dest, const float *src, size_t nreduce)
{
  return nvshmemx_float_sum_reduce_warp(team, dest, src, (int) nreduce);
}

NVHPC_NVSHMEM_DEV_SIG int
__nvhpc_cudalib_nvshmemx_double_sum_reduce_warp(int team, double *dest, const double *src, size_t nreduce)
{
  return nvshmemx_double_sum_reduce_warp(team, dest, src, (int) nreduce);
}
*/
