17#include <cuda_runtime.h> 
   19#include "arcane/utils/PlatformUtils.h" 
   20#include "arcane/utils/NotSupportedException.h" 
   21#include "arcane/utils/Real3.h" 
   22#include "arcane/utils/NumArray.h" 
   25#include "arcane/core/Item.h" 
   28#include "arcane/accelerator/core/Runner.h" 
   29#include "arcane/accelerator/core/RunQueue.h" 
   31#include "arcane/accelerator/cuda/CudaAccelerator.h" 
   34#include <cooperative_groups.h> 
   35namespace cg = cooperative_groups;
 
   /*!
    * \brief Test kernel for cooperative launches: element-wise sum of two arrays.
    *
    * Computes out[i] = a[i] + b[i] for every i in [0, nb_value) after a
    * grid-wide barrier.
    *
    * Launch requirements:
    * - must be launched with cudaLaunchCooperativeKernel(), because it calls
    *   grid_group::sync();
    * - 1D grid and 1D blocks, with gridDim.x * blockDim.x >= nb_value;
    * - no dynamic shared memory is used.
    *
    * \param a        first input array (at least nb_value elements).
    * \param b        second input array (at least nb_value elements).
    * \param out      output array (at least nb_value elements).
    * \param nb_value number of elements to process.
    */
   __global__ void MyVecAdd3(double* a, double* b, double* out, int nb_value)
   {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
     cg::grid_group this_grid_group = cg::this_grid();

     // Every thread of the grid has to reach the barrier, including the ones
     // whose index is past the end of the arrays. The bounds check therefore
     // comes AFTER the synchronization: returning before it would leave the
     // other threads waiting on the barrier.
     this_grid_group.sync();

     // The grid is rounded up to a whole number of blocks, so the trailing
     // threads have no element to process.
     if (i >= nb_value)
       return;

     out[i] = a[i] + b[i];

     // Device printf is slow and serialized: only trace the first few values.
     if (i < 10) {
       printf("A=%d %lf %lf %lf grid_size=%llu \n", i, a[i], b[i], out[i],
              static_cast<unsigned long long>(this_grid_group.size()));
     }
   }
 
   /*!
    * \brief Host-side test of a cooperative kernel launch.
    *
    * Fills two host arrays of 2000 values, copies them to the device, launches
    * MyVecAdd3 through cudaLaunchCooperativeKernel(), copies the result back and
    * prints its first 10 values on std::cout.
    *
    * Every CUDA runtime call goes through ARCANE_CHECK_CUDA so that a failed
    * allocation or copy is reported where it happens.
    * NOTE(review): if ARCANE_CHECK_CUDA throws, the device buffers allocated
    * before the failure are not released - acceptable for a test, confirm.
    */
   extern "C" void arcaneTestCooperativeLaunch()
   {
     std::cout << "Test Cooperative Launch\n";

     constexpr int vsize = 2000;
     std::vector<double> a(vsize);
     std::vector<double> b(vsize);
     std::vector<double> out(vsize);
     for (size_t i = 0; i < static_cast<size_t>(vsize); ++i) {
       a[i] = (double)(i + 1);
       b[i] = (double)(i * i + 1);
     }

     // Device buffers, one per array.
     size_t mem_size = vsize * sizeof(double);
     double* d_a = nullptr;
     ARCANE_CHECK_CUDA(cudaMalloc(&d_a, mem_size));
     double* d_b = nullptr;
     ARCANE_CHECK_CUDA(cudaMalloc(&d_b, mem_size));
     double* d_out = nullptr;
     ARCANE_CHECK_CUDA(cudaMalloc(&d_out, mem_size));

     ARCANE_CHECK_CUDA(cudaMemcpy(d_a, a.data(), mem_size, cudaMemcpyHostToDevice));
     ARCANE_CHECK_CUDA(cudaMemcpy(d_b, b.data(), mem_size, cudaMemcpyHostToDevice));

     // Round the grid up so that every element gets a thread.
     int threadsPerBlock = 256;
     int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
     std::cout << "CALLING kernel tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";

     // cudaLaunchCooperativeKernel() takes the kernel arguments as an array of
     // pointers to each argument, in declaration order. These locals must stay
     // alive until the launch call returns.
     int nb_value = vsize;
     void* args[] = { &d_a, &d_b, &d_out, &nb_value };
     const void* func_ptr = reinterpret_cast<const void*>(&MyVecAdd3);
     ARCANE_CHECK_CUDA(cudaLaunchCooperativeKernel(func_ptr, dim3(blocksPerGrid), dim3(threadsPerBlock), args, 0, 0));
     ARCANE_CHECK_CUDA(cudaDeviceSynchronize());

     ARCANE_CHECK_CUDA(cudaMemcpy(out.data(), d_out, mem_size, cudaMemcpyDeviceToHost));
     for (size_t i = 0; i < 10; ++i)
       std::cout << "V=" << out[i] << "\n";

     // Release the device buffers: they are only needed for this test.
     ARCANE_CHECK_CUDA(cudaFree(d_a));
     ARCANE_CHECK_CUDA(cudaFree(d_b));
     ARCANE_CHECK_CUDA(cudaFree(d_out));
   }
 
Fonctions mathématiques diverses.
Fonctions de gestion mémoire et des allocateurs.
Types et macros pour gérer les boucles sur les accélérateurs.
Espace de nom pour l'utilisation des accélérateurs.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
Espace de nom de Arccore.