本文共 2315 字,大约阅读时间需要 7 分钟。
92
using global memory
/*********************** using global memory ***********************/

// A __global__ function runs on the GPU and can be called from the host.
//
// Note: kernel parameters are local and private to each thread, so the
// kernel takes a pointer in order to reach the global memory that all
// threads share.
//
// Launch layout: the element index is threadIdx.x alone (blockIdx is not
// used), so every block writes the same elements.
// Precondition: "array" must hold at least blockDim.x floats.
__global__ void use_global_memory_GPU(float *array)
{
    // "array" is a pointer into global memory on the device
    array[threadIdx.x] = 2.0f * (float) threadIdx.x;
}

// Host entry point: launches the local-memory demo kernel, then round-trips
// a 128-element array through device global memory.
//
// Returns 0 on success and 1 if any CUDA call or kernel launch reports an
// error. The device buffer is released on every path after it was allocated.
int main(int argc, char **argv)
{
    (void) argc;  // command-line arguments are not used
    (void) argv;

    const int kNumElements = 128;

    /*
     * First, call a kernel that shows using local memory
     */
    use_local_memory_GPU<<<1, 128>>>(2.0f);
    // a kernel launch returns nothing; launch errors must be queried explicitly
    if (cudaGetLastError() != cudaSuccess) {
        return 1;
    }

    /*
     * Next, call a kernel that shows using global memory
     */
    // convention: h_ variables live on host.
    // Zero-initialised so the host-to-device copy below never reads
    // indeterminate values.
    float h_arr[kNumElements] = {0.0f};
    // convention: d_ variables live on device (GPU global mem)
    float *d_arr = 0;

    // allocate global memory on the device, place result in "d_arr"
    cudaError_t status =
        cudaMalloc((void **) &d_arr, sizeof(float) * kNumElements);
    if (status != cudaSuccess) {
        return 1;  // nothing was allocated, so there is nothing to free
    }

    // now copy data from host memory "h_arr" to device memory "d_arr"
    status = cudaMemcpy((void *) d_arr, (void *) h_arr,
                        sizeof(float) * kNumElements,
                        cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        // launch the kernel (1 block of 128 threads);
        // it modifies the contents of the array at d_arr
        use_global_memory_GPU<<<1, kNumElements>>>(d_arr);
        status = cudaGetLastError();
    }

    if (status == cudaSuccess) {
        // copy the modified array back to the host, overwriting the
        // contents of h_arr
        status = cudaMemcpy((void *) h_arr, (void *) d_arr,
                            sizeof(float) * kNumElements,
                            cudaMemcpyDeviceToHost);
    }

    // ... do other stuff ...

    // release the device allocation whether or not the steps above succeeded
    cudaError_t free_status = cudaFree(d_arr);
    if (status == cudaSuccess) {
        status = free_status;
    }

    return (status == cudaSuccess) ? 0 : 1;
}
shared memory
// Demonstrates shared memory: each thread stages one element of "array"
// into a block-wide __shared__ buffer, averages the elements staged by the
// lower-indexed threads of its block, and lowers its own element of "array"
// to that average when the element is larger.
//
// Launch layout: the element index is threadIdx.x alone and the shared
// buffer has a fixed size of 128 floats, so the kernel must be launched
// with at most 128 threads per block.
// Precondition: "array" must hold at least blockDim.x floats.
__global__ void use_shared_memory_GPU(float *array)
{
    // local variables, private to each thread
    int i, index = threadIdx.x;
    float average, sum = 0.0f;

    // __shared__ variables are visible to all threads in the thread block
    // and have the same lifetime as the thread block
    __shared__ float sh_arr[128];

    // copy data from "array" in global memory to sh_arr in shared memory.
    // here, each thread is responsible for copying a single element.
    sh_arr[index] = array[index];

    // ensure all the writes to shared memory have completed
    __syncthreads();

    // now, sh_arr is fully populated. Let's find the average of all
    // previous elements.
    // NOTE(review): the loop condition, loop body, average computation and
    // the comparison below were lost from the source text (everything
    // between '<' and '>' was stripped). They are reconstructed from the
    // surrounding comments and declarations -- confirm against the original.
    for (i = 0; i < index; i++) {
        sum += sh_arr[i];
    }
    // dividing by index + 1 (not index) keeps thread 0 from computing 0/0
    average = sum / (index + 1.0f);

    // if array[index] is greater than the average of the previous elements,
    // replace it with the average. "array" is written in place, so the
    // change is what a later device-to-host copy of "array" will read.
    if (array[index] > average) {
        array[index] = average;
    }

    // the following code has NO EFFECT: it modifies shared memory, but
    // the resulting modified data is never copied back to global memory
    // and vanishes when the thread block completes
    sh_arr[index] = 3.14f;  // float literal: avoids a double-to-float conversion
}