
Programming on GPGPU: CUDA Parallel Computing

Central Processing Unit (CPU)

Common CPU

CPU Functionality

Multiple Processing Units

Definition

Using Parallel Programs

Multi-core vs Many-core

Multicore CPU

Many-core GPU

Throughput Ratio

GPU vs CPU

GPU Characteristics

CPU Characteristics

GPU Technology

GPU Usage

Heterogeneous Computing

GPU Architecture

Compute Unified Device Architecture (CUDA)

Introduction

CUDA Programming Model

Scalability

Why CUDA?

Business Rationale

Technical Rationale

GPU Memory Types – Performance Comparison

  1. Register Memory
  2. Shared Memory
  3. Constant Memory
  4. Texture Memory
  5. Local Memory

Memory Types Overview

Memory Features

| Memory Type | Visibility  | Existence | Performance           |
|-------------|-------------|-----------|-----------------------|
| Register    | Thread      | Thread    | Fastest               |
| Local       | Thread      | Thread    | Slower than register  |
| Shared      | Block       | Block     | Fast (on-chip)        |
| Global      | Application | Host      | Slow (off-chip)       |
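
The notes give no code for this comparison; a small sketch of how each memory type typically appears in a kernel (names and sizes are illustrative, texture memory is omitted because it needs extra setup, and the kernel assumes at most 256 threads per block):

__constant__ float coeff[16];                     // constant memory: read-only in kernels, cached

__global__ void memory_demo(const float *g_in, float *g_out) {
    __shared__ float tile[256];                   // shared memory: one copy per block
    int i = threadIdx.x;                          // scalar locals normally live in registers
    float r = g_in[i] * coeff[i % 16];            // r: register, private to this thread
    float scratch[32];                            // large per-thread arrays may spill to local memory
    scratch[i % 32] = r;
    tile[i] = scratch[i % 32];
    __syncthreads();
    g_out[i] = tile[i];                           // g_in/g_out point to global memory (cudaMalloc)
}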

Programming on Heterogeneous Computing Environment

Simple Processing Flow

  1. Copy input data from CPU memory to GPU memory.
  2. Load GPU code and execute it.
  3. Copy results from GPU memory to CPU memory.
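
A minimal, self-contained illustration of these three steps (the kernel and sizes are made up for this sketch; the full examples below follow the same pattern):

#include <stdio.h>

__global__ void square(int *data) {
    data[threadIdx.x] *= data[threadIdx.x];
}

int main(void) {
    int h[4] = {1, 2, 3, 4};
    int *d;
    cudaMalloc((void **)&d, sizeof(h));
    cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);   // 1. copy input to GPU memory
    square<<<1, 4>>>(d);                                    // 2. load and execute GPU code
    cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);    // 3. copy results back to CPU memory
    printf("%d %d %d %d\n", h[0], h[1], h[2], h[3]);
    cudaFree(d);
    return 0;
}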

Program Examples

Example without Device Code

#include <stdio.h>

int main(void) {
    printf("Hello World!\n");
    return 0;
}

Compile with nvcc: nvcc HelloWorld.cu

Example with Device Code

#include <stdio.h>

__global__ void mykernel(void) {
    // This kernel intentionally does nothing; it only shows a device function.
}

int main(void) {
    mykernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}

Example 2: Adding 2 Integers

__global__ void add(int *a, int *b, int *c) {
    *c = *a + *b;
}

int main(void) {
    // Host copies of a, b, c
    int a, b, c;
    
    // Device copies of a, b, c
    int *d_a, *d_b, *d_c;
    int size = sizeof(int);
    
    // Allocate space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);
    
    // Setup input values
    a = 2;
    b = 7;

    // Copy inputs to device
    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU
    add<<<1,1>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);

    return 0;
}

Memory Management

Host and Device Memory
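
The notes do not expand on this heading; the host-side calls used throughout the examples below are the standard CUDA runtime memory API:

cudaError_t cudaMalloc(void **devPtr, size_t size);        // allocate device (GPU) memory
cudaError_t cudaFree(void *devPtr);                        // free device memory
cudaError_t cudaMemcpy(void *dst, const void *src,
                       size_t count, cudaMemcpyKind kind); // copy between host and device

Pointers returned by cudaMalloc refer to device memory and must not be dereferenced in host code; they are only passed to kernels and to cudaMemcpy.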

Addition on the Device

__global__ void add(int *a, int *b, int *c) {
   *c = *a + *b;
}

Moving to Parallel

Vector Addition on the Device

__global__ void add(int *a, int *b, int *c) {
   c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

With N blocks, each block computes one element of the result:

c[0] = a[0] + b[0];
c[1] = a[1] + b[1];
c[2] = a[2] + b[2];
c[3] = a[3] + b[3];


Vector Addition on the Device: main()

#define N 512
int main(void) {
   int *a, *b, *c; // host copies of a, b, c
   int *d_a, *d_b, *d_c; // device copies of a, b, c
   int size = N * sizeof(int);

   // Alloc space for device copies of a, b, c
   cudaMalloc((void **)&d_a, size);
   cudaMalloc((void **)&d_b, size);
   cudaMalloc((void **)&d_c, size);

   // Alloc space for host copies of a, b, c and setup input values
   a = (int *)malloc(size); random_ints(a, N);
   b = (int *)malloc(size); random_ints(b, N);
   c = (int *)malloc(size);

   // Copy inputs to device
   cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

   // Launch add() kernel on GPU with N blocks
   add<<<N,1>>>(d_a, d_b, d_c);

   // Copy result back to host
   cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

   // Cleanup
   free(a); free(b); free(c);
   cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
   return 0;
}
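
The examples call random_ints(), which never appears in these notes; a plausible definition (an assumption, not from the original) is:

#include <stdlib.h>

// Hypothetical helper assumed by the examples: fill an n-element array with random integers.
void random_ints(int *p, int n) {
    for (int i = 0; i < n; i++)
        p[i] = rand() % 100;   // range is arbitrary, chosen for illustration
}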

Review (1 of 2)

Review (2 of 2)

Introducing Threads

CUDA Threads

__global__ void add(int *a, int *b, int *c) {
   c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

Vector Addition Using Threads: main()

#define N 512
int main(void) {
   int *a, *b, *c; // host copies of a, b, c
   int *d_a, *d_b, *d_c; // device copies of a, b, c
   int size = N * sizeof(int);

   // Alloc space for device copies of a, b, c
   cudaMalloc((void **)&d_a, size);
   cudaMalloc((void **)&d_b, size);
   cudaMalloc((void **)&d_c, size);

   // Alloc space for host copies of a, b, c and setup input values
   a = (int *)malloc(size); random_ints(a, N);
   b = (int *)malloc(size); random_ints(b, N);
   c = (int *)malloc(size);

   // Copy inputs to device
   cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

   // Launch add() kernel on GPU with N threads
   add<<<1,N>>>(d_a, d_b, d_c);

   // Copy result back to host
   cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

   // Cleanup
   free(a); free(b); free(c);
   cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
   return 0;
}

Combining Threads and Blocks

Combining Blocks and Threads

__global__ void add(int *a, int *b, int *c) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   c[index] = a[index] + b[index];
}

Addition with Blocks and Threads: main()

#define N (2048*2048)
#define THREADS_PER_BLOCK 512
int main(void) {
   int *a, *b, *c; // host copies of a, b, c
   int *d_a, *d_b, *d_c; // device copies of a, b, c
   int size = N * sizeof(int);

   // Alloc space for device copies of a, b, c
   cudaMalloc((void **)&d_a, size);
   cudaMalloc((void **)&d_b, size);
   cudaMalloc((void **)&d_c, size);

   // Alloc space for host copies of a, b, c and setup input values
   a = (int *)malloc(size); random_ints(a, N);
   b = (int *)malloc(size); random_ints(b, N);
   c = (int *)malloc(size);

   // Copy inputs to device
   cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
   cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);

   // Launch add() kernel on GPU
   add<<<(N + THREADS_PER_BLOCK-1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a, d_b, d_c);

   // Copy result back to host
   cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);

   // Cleanup
   free(a); free(b); free(c);
   cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
   return 0;
}

Handling Arbitrary Vector Sizes

Pass the vector length to the kernel, guard the index inside it, and launch enough blocks of M threads to cover all N elements:

add<<<(N + M - 1) / M, M>>>(d_a, d_b, d_c, N);
__global__ void add(int *a, int *b, int *c, int n) {
   int index = threadIdx.x + blockIdx.x * blockDim.x;
   if (index < n)
      c[index] = a[index] + b[index];
}

SAXPY Example (Single-Precision A·X Plus Y)
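
The notes leave this heading empty; a minimal SAXPY kernel in the same style as the vector-addition examples (the kernel body and launch line are assumptions, not taken from the original):

__global__ void saxpy(int n, float a, float *x, float *y) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n)
        y[i] = a * x[i] + y[i];   // y = a*x + y, one element per thread
}

// e.g. saxpy<<<(n + 255) / 256, 256>>>(n, 2.0f, d_x, d_y);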

Why Bother with Threads?

Sharing Data Between Threads

__syncthreads()

Dot Product Overview
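
All of the kernels below compute a single scalar from two N-element vectors, which is why the threads eventually have to share data:

c = a[0]*b[0] + a[1]*b[1] + ... + a[N-1]*b[N-1]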

Parallel Threads for Pairwise Products

__global__ void dot(int *a, int *b, int *c) {
    // Each thread computes a pairwise product
    int temp = a[threadIdx.x] * b[threadIdx.x];
    // Can’t compute the final sum
    // Each thread’s copy of ‘temp’ is private
}

Sharing Data Between Threads

Parallel Dot Product

#define N 512
__global__ void dot(int *a, int *b, int *c) {
    // Shared memory for results of multiplication
    __shared__ int temp[N];
    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
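    // NOTE: nothing stops thread 0 from reading temp[] before the other
    // threads have written it; the next version adds the missing barrier.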
    // Thread 0 sums the pairwise products
    if (0 == threadIdx.x) {
        int sum = 0;
        for (int i = 0; i < N; i++)
            sum += temp[i];
        *c = sum;
    }
}

Synchronization with __syncthreads()

__global__ void dot(int *a, int *b, int *c) {
    __shared__ int temp[N];
    temp[threadIdx.x] = a[threadIdx.x] * b[threadIdx.x];
    __syncthreads();
    if (0 == threadIdx.x) {
        int sum = 0;
        for (int i = 0; i < N; i++)
            sum += temp[i];
        *c = sum;
    }
}

Multiblock Dot Product

#define N (2048*2048)
#define THREADS_PER_BLOCK 512
__global__ void dot(int *a, int *b, int *c) {
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    temp[threadIdx.x] = a[index] * b[index];
    __syncthreads();
    if (0 == threadIdx.x) {
        int sum = 0;
        for (int i = 0; i < THREADS_PER_BLOCK; i++)
            sum += temp[i];
        *c += sum;   // unsafe: every block updates *c at the same time (race condition)
    }
}

Atomic Operations to Resolve Race Conditions

#define N (2048*2048)
#define THREADS_PER_BLOCK 512
__global__ void dot(int *a, int *b, int *c) {
    __shared__ int temp[THREADS_PER_BLOCK];
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    temp[threadIdx.x] = a[index] * b[index];
    __syncthreads();
    if (0 == threadIdx.x) {
        int sum = 0;
        for (int i = 0; i < THREADS_PER_BLOCK; i++)
            sum += temp[i];
        atomicAdd(c, sum);
    }
}

Main Function for Multiblock Dot Product

#define N (2048*2048)
#define THREADS_PER_BLOCK 512
int main(void) {
    int *a, *b, *c; // host copies of a, b, c
    int *dev_a, *dev_b, *dev_c; // device copies of a, b, c
    int size = N * sizeof(int); // space for N integers
    // allocate device copies of a, b, c
    cudaMalloc((void**)&dev_a, size);
    cudaMalloc((void**)&dev_b, size);
    cudaMalloc((void**)&dev_c, sizeof(int));
    a = (int *)malloc(size);
    b = (int *)malloc(size);
    c = (int *)malloc(sizeof(int));
    random_ints(a, N);
    random_ints(b, N);
    // copy inputs to device
    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
    // zero the accumulator on the device before launching
    cudaMemset(dev_c, 0, sizeof(int));
    // launch dot() kernel
    dot<<<N/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(dev_a, dev_b, dev_c);
    // copy device result back to host copy of c
    cudaMemcpy(c, dev_c, sizeof(int) , cudaMemcpyDeviceToHost);
    free(a); free(b); free(c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    return 0;
}

Review

References