#!/usr/bin/env python # coding: utf-8 # In[3]: get_ipython().system('nvidia-smi') # In[16]: get_ipython().system('LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64 nvidia-smi') # In[17]: get_ipython().run_cell_magic('file', 'hello.cu', '#include \n#include \n#include \n \n// CUDA kernel. Each thread takes care of one element of c\n__global__ void vecAdd(double *a, double *b, double *c, int n)\n{\n // Get our global thread ID\n int id = blockIdx.x*blockDim.x+threadIdx.x;\n \n // Make sure we do not go out of bounds\n if (id < n)\n c[id] = a[id] + b[id];\n}\n \nint main( int argc, char* argv[] )\n{\n // Size of vectors\n int n = 100000;\n \n // Host input vectors\n double *h_a;\n double *h_b;\n //Host output vector\n double *h_c;\n \n // Device input vectors\n double *d_a;\n double *d_b;\n //Device output vector\n double *d_c;\n \n // Size, in bytes, of each vector\n size_t bytes = n*sizeof(double);\n \n // Allocate memory for each vector on host\n h_a = (double*)malloc(bytes);\n h_b = (double*)malloc(bytes);\n h_c = (double*)malloc(bytes);\n \n // Allocate memory for each vector on GPU\n cudaMalloc(&d_a, bytes);\n cudaMalloc(&d_b, bytes);\n cudaMalloc(&d_c, bytes);\n \n int i;\n // Initialize vectors on host\n for( i = 0; i < n; i++ ) {\n h_a[i] = sin(i)*sin(i);\n h_b[i] = cos(i)*cos(i);\n }\n \n // Copy host vectors to device\n cudaMemcpy( d_a, h_a, bytes, cudaMemcpyHostToDevice);\n cudaMemcpy( d_b, h_b, bytes, cudaMemcpyHostToDevice);\n \n int blockSize, gridSize;\n \n // Number of threads in each thread block\n blockSize = 1024;\n \n // Number of thread blocks in grid\n gridSize = (int)ceil((float)n/blockSize);\n \n // Execute the kernel\n vecAdd<<>>(d_a, d_b, d_c, n);\n \n // Copy array back to host\n cudaMemcpy( h_c, d_c, bytes, cudaMemcpyDeviceToHost );\n \n // Sum up vector c and print result divided by n, this should equal 1 within error\n double sum = 0;\n for(i=0; i
# NOTE(review): The line above is `jupyter nbconvert --to script` output for a
# notebook, but every newline in the export has been lost, so the whole script
# is collapsed onto the shebang line. Because that line starts with `#`, Python
# treats ALL of it as a comment: importing or running this file executes
# nothing. The original cell boundaries are still visible as `# In[3]:`,
# `# In[16]:`, `# In[17]:` markers.
#
# What the cells did (as far as visible here):
#   - In[3]:  shell out to `nvidia-smi`.
#   - In[16]: same, with LD_LIBRARY_PATH pointed at /usr/local/nvidia/lib{,64}
#             (presumably a container where the driver libs are not on the
#             default loader path — confirm against the runtime environment).
#   - In[17]: `%%file hello.cu` — writes an embedded CUDA C vector-add program
#             (kernel `vecAdd` with a correct `if (id < n)` bounds guard, plus
#             a `main` that allocates host/device buffers, initializes
#             sin^2(i) + cos^2(i) inputs, copies, and launches the kernel).
#
# The embedded CUDA text is itself damaged and must NOT be compiled as-is:
#   - The three `#include` directives have lost their header names — the angle-
#     bracketed names were stripped, presumably by HTML-tag scrubbing; the code
#     uses printf-family I/O patterns, malloc/free, and sin/cos/ceil, so
#     <stdio.h>, <stdlib.h>, <math.h> are the likely originals — TODO confirm
#     against the source notebook.
#   - The kernel launch reads `vecAdd<<>>(d_a, d_b, d_c, n)`: the execution
#     configuration between `<<<` and `>>>` was stripped (same tag-scrubbing
#     symptom). Given the `blockSize = 1024` / `gridSize = ceil(n/blockSize)`
#     setup immediately before it, it was presumably
#     `vecAdd<<<gridSize, blockSize>>>(d_a, d_b, d_c, n)` — verify.
#   - The text is TRUNCATED mid-statement at `for(i=0; i` — the tail of main()
#     (the sum/print loop described by the preceding comment, cudaFree/free
#     cleanup, return) and any later cells (e.g. an nvcc compile/run step) are
#     not visible here.
#
# Recovery: restore this file from the original .ipynb (re-run nbconvert)
# rather than reconstructing the missing text by hand; the damage is an
# export/paste artifact, not an authoring error.