compile

Runtime CUDA function compiler. TODO: add a type-checked version (pick up the d-nv implementation).

@nogc nothrow
CUfunction compile(
    scope string name,
    scope string args,
    scope string proc,
    CompileOpt opt = CompileOpt.init
)
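
A runtime compiler of this shape typically wraps name, args, and proc into an extern "C" __global__ source string, compiles it to PTX with NVRTC, loads the PTX as a driver-API module, and looks the kernel up by name. The sketch below only illustrates that flow; it is not grain's actual implementation: the grain.dpp.nvrtc module name is an assumption, and error checking and the @nogc constraint are omitted.

// Minimal sketch of an NVRTC-based runtime compiler (assumptions noted above).
import std.string : toStringz;
import grain.dpp.cuda_driver : CUfunction, CUmodule, cuModuleLoadData, cuModuleGetFunction;
import grain.dpp.nvrtc; // assumed binding module for the NVRTC C API

CUfunction compileSketch(string name, string args, string proc)
{
    // 1. assemble a CUDA C source string around the given parameter list and body
    auto src = `extern "C" __global__ void ` ~ name ~ `(` ~ args ~ `) {` ~ proc ~ `}`;

    // 2. compile the source to PTX with NVRTC
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src.toStringz, (name ~ ".cu").toStringz, 0, null, null);
    nvrtcCompileProgram(prog, 0, null);
    size_t ptxSize;
    nvrtcGetPTXSize(prog, &ptxSize);
    auto ptx = new char[ptxSize];
    nvrtcGetPTX(prog, ptx.ptr);
    nvrtcDestroyProgram(&prog);

    // 3. load the PTX as a module and fetch the kernel handle by name
    CUmodule mod;
    CUfunction fn;
    cuModuleLoadData(&mod, ptx.ptr);
    cuModuleGetFunction(&fn, mod, name.toStringz);
    return fn;
}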

Examples

import grain.testing : assertAllClose;
import grain.tensor : Tensor, Opt;
import grain.cuda : CuTensor, CuDevice;
import grain.random : normal_;
import grain.ops : copy;
import grain.dpp.cuda_driver : cuLaunchKernel, CUstream;

// compile a vector-add kernel at runtime
auto cufun = compile(
    "vectorAdd",
    q{const float *A, const float *B, float *C, int numElements},
    q{
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < numElements) {
            C[i] = A[i] + B[i];
        }
    });

scope auto n = 50000;
auto ha = Tensor!(1, float)(n).normal_;
auto hb = Tensor!(1, float)(n).normal_;

// copy the inputs to the device and allocate the output
// auto da = ha.copy!"cuda";
// auto db = hb.copy!"cuda";
// auto dc = CuTensor!(1, float)(n);

// launch configuration and kernel arguments
// int threadPerBlock = 256;
// int sharedMemBytes = 0;
// auto stream = CuDevice.get(dc.deviceId).stream;
// auto ps = [da.ptr, db.ptr, dc.ptr];
// scope void*[4] args = [
//     cast(void*) &ps[0],
//     cast(void*) &ps[1],
//     cast(void*) &ps[2],
//     cast(void*) &n
// ];
// void*[] config;
// // NOTE launching via the runtime API failed; cudaLaunchKernel takes a host
// // kernel symbol, not a driver-API CUfunction
// // import grain.dpp.cuda_runtime_api;
// // checkCuda(cudaLaunchKernel(
// //     cufun,
// //     // grid (number of blocks)
// //     dim3((n + threadPerBlock - 1) / threadPerBlock, 1, 1),
// //     // block (threads per block)
// //     dim3(threadPerBlock, 1, 1),
// //     args.ptr,
// //     sharedMemBytes, stream));

// driver api
// checkCuda(cuLaunchKernel(
//     cufun,
//     // grid (number of blocks)
//     (n + threadPerBlock - 1) / threadPerBlock, 1, 1,
//     // block (threads per block)
//     threadPerBlock, 1, 1,
//     sharedMemBytes, cast(CUstream) stream, args.ptr, config.ptr));

// copy the result back and compare against the host-side sum
// auto hc = dc.copy!"cpu";
// assertAllClose(ha.asSlice + hb.asSlice, hc.asSlice);
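
For reference, the snippet below sketches the full driver-API path for a kernel compiled this way, using raw cuMemAlloc buffers instead of grain tensors. It is an illustrative sketch, not part of grain's test suite: it assumes grain.dpp.cuda_driver exposes the listed driver-API entry points (cuInit, cuCtxCreate, cuMemAlloc, ...) under their documented names, and it omits error checking (each call should go through checkCuda in real code).

// Illustrative driver-API launch with raw device buffers (assumptions noted above).
import grain.dpp.cuda_driver;

// driver initialization and a context on device 0
// (grain normally manages this through CuDevice)
cuInit(0);
CUdevice dev;
cuDeviceGet(&dev, 0);
CUcontext ctx;
cuCtxCreate(&ctx, 0, dev);

// compile the same vector-add kernel as in the example above
auto cufun = compile("vectorAdd",
    q{const float *A, const float *B, float *C, int numElements},
    q{
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < numElements) C[i] = A[i] + B[i];
    });

// host buffers
int n = 50000;
auto ha = new float[n];
auto hb = new float[n];
auto hc = new float[n];
foreach (i; 0 .. n) { ha[i] = i; hb[i] = 2.0f * i; }

// device buffers for the two inputs and the output
CUdeviceptr da, db, dc;
cuMemAlloc(&da, n * float.sizeof);
cuMemAlloc(&db, n * float.sizeof);
cuMemAlloc(&dc, n * float.sizeof);
cuMemcpyHtoD(da, ha.ptr, n * float.sizeof);
cuMemcpyHtoD(db, hb.ptr, n * float.sizeof);

// each kernel parameter is passed as a pointer to its value
void*[4] launchArgs = [cast(void*) &da, cast(void*) &db, cast(void*) &dc, cast(void*) &n];

// grid = number of blocks, block = threads per block
uint threadPerBlock = 256;
uint blocks = (n + threadPerBlock - 1) / threadPerBlock;
cuLaunchKernel(cufun,
    blocks, 1, 1,          // grid dimensions
    threadPerBlock, 1, 1,  // block dimensions
    0, null,               // dynamic shared memory bytes, default stream
    launchArgs.ptr, null); // kernel parameters, no extra options
cuCtxSynchronize();

// copy the result back and spot-check it
cuMemcpyDtoH(hc.ptr, dc, n * float.sizeof);
assert(hc[100] == ha[100] + hb[100]);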

Meta