#!/usr/bin/env python # coding: utf-8 # In[3]: get_ipython().system('nvidia-smi') # In[16]: get_ipython().system('LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 nvidia-smi') # In[17]: get_ipython().run_cell_magic('file', 'hello.cu', '#include \n#include \n#include \n \n// CUDA kernel. Each thread takes care of one element of c\n__global__ void vecAdd(double *a, double *b, double *c, int n)\n{\n // Get our global thread ID\n int id = blockIdx.x*blockDim.x+threadIdx.x;\n \n // Make sure we do not go out of bounds\n if (id < n)\n c[id] = a[id] + b[id];\n}\n \nint main( int argc, char* argv[] )\n{\n // Size of vectors\n int n = 100000;\n \n // Host input vectors\n double *h_a;\n double *h_b;\n //Host output vector\n double *h_c;\n \n // Device input vectors\n double *d_a;\n double *d_b;\n //Device output vector\n double *d_c;\n \n // Size, in bytes, of each vector\n size_t bytes = n*sizeof(double);\n \n // Allocate memory for each vector on host\n h_a = (double*)malloc(bytes);\n h_b = (double*)malloc(bytes);\n h_c = (double*)malloc(bytes);\n \n // Allocate memory for each vector on GPU\n cudaMalloc(&d_a, bytes);\n cudaMalloc(&d_b, bytes);\n cudaMalloc(&d_c, bytes);\n \n int i;\n // Initialize vectors on host\n for( i = 0; i < n; i++ ) {\n h_a[i] = sin(i)*sin(i);\n h_b[i] = cos(i)*cos(i);\n }\n \n // Copy host vectors to device\n cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);\n cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);\n \n int blockSize, gridSize;\n \n // Number of threads in each thread block\n blockSize = 1024;\n \n // Number of thread blocks in grid\n gridSize = (int)ceil((float)n/blockSize);\n \n // Execute the kernel\n vecAdd<<>>(d_a, d_b, d_c, n);\n \n // Copy array back to host\n cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );\n \n // Sum up vector c and print result divided by n, this should equal 1 within error\n double sum = 0;\n for(i=0; i
# NOTE(review): The line above is `jupyter nbconvert --to script` output for a
# notebook, but every newline in the export has been lost, so the whole script
# is collapsed onto the shebang line. Because that line starts with `#`, Python
# treats ALL of it as a comment: importing or running this file executes
# nothing. The original cell boundaries are still visible as `# In[3]:`,
# `# In[16]:`, `# In[17]:` markers.
#
# What the cells did (as far as visible here):
#   - In[3]:  shell out to `nvidia-smi`.
#   - In[16]: same, with LD_LIBRARY_PATH pointed at /usr/local/nvidia/lib{,64}
#             (presumably a container where the driver libs are not on the
#             default loader path — confirm against the runtime environment).
#   - In[17]: `%%file hello.cu` — writes an embedded CUDA C vector-add program
#             (kernel `vecAdd` with a correct `if (id < n)` bounds guard, plus
#             a `main` that allocates host/device buffers, initializes
#             sin^2(i) + cos^2(i) inputs, copies, and launches the kernel).
#
# The embedded CUDA text is itself damaged and must NOT be compiled as-is:
#   - The three `#include` directives have lost their header names — the angle-
#     bracketed names were stripped, presumably by HTML-tag scrubbing; the code
#     uses printf-family I/O patterns, malloc/free, and sin/cos/ceil, so
#     <stdio.h>, <stdlib.h>, <math.h> are the likely originals — TODO confirm
#     against the source notebook.
#   - The kernel launch reads `vecAdd<<>>(d_a, d_b, d_c, n)`: the execution
#     configuration between `<<<` and `>>>` was stripped (same tag-scrubbing
#     symptom). Given the `blockSize = 1024` / `gridSize = ceil(n/blockSize)`
#     setup immediately before it, it was presumably
#     `vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n)` — verify.
#   - The text is TRUNCATED mid-statement at `for(i=0; i` — the tail of main()
#     (the sum/print loop described by the preceding comment, cudaFree/free
#     cleanup, return) and any later cells (e.g. an nvcc compile/run step) are
#     not visible here.
#
# Recovery: restore this file from the original .ipynb (re-run nbconvert)
# rather than reconstructing the missing text by hand; the damage is an
# export/paste artifact, not an authoring error.