This article presents HIP test 2: CPU2DCU and DCU2DCU bandwidth tests. It is intended as a practical reference for developers working on HIP programming problems on DCU hardware.
HIP Test 2: CPU2DCU and DCU2DCU Bandwidth Tests
1. Intra-node CPU2DCU bandwidth test
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <hip/hip_runtime.h>

#define NSTREAM 2
#define BDIM    512

void initialData(float *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
        // printf("%f\n", ip[i]);
    }
}

void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++) C[idx] = A[idx] + B[idx];
}

__global__ void sumArrays(float *A, float *B, float *C, const int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < N)
    {
        // repeat the addition so the kernel has a measurable runtime
        for (int j = 0; j < 60; j++)
        {
            C[idx] = A[idx] + B[idx];
        }
    }
}

void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = true;

    for (int i = 0; i < N; i++)
    {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
        {
            match = false;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");
}

int main(int argc, char **argv)
{
    printf("> %s Starting...\n", argv[0]);

    int dev = 0;
    hipSetDevice(dev);
    hipDeviceProp_t deviceProp;
    hipGetDeviceProperties(&deviceProp, dev);
    printf("> Using Device %d: %s\n", dev, deviceProp.name);

    // set up data size of vectors
    // int nElem = 1 << 2;   // tiny size, handy for debugging
    int nElem = 1 << 24;
    printf("> vector size = %d\n", nElem);
    size_t nBytes = nElem * sizeof(float);
    printf("> buffer size = %zu MB\n", nBytes / 1024 / 1024);

    // page-locked (pinned) host memory: the DMA engines can transfer it directly
    float *h_A, *h_B, *h_C;
    hipHostMalloc((void **)&h_A, nBytes, hipHostMallocDefault);
    hipHostMalloc((void **)&h_B, nBytes, hipHostMallocDefault);
    hipHostMalloc((void **)&h_C, nBytes, hipHostMallocDefault);

    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(h_C, 0, nBytes);
    // sumArraysOnHost(h_A, h_B, hostRef, nElem);

    float *d_A, *d_B, *d_C;
    hipMalloc((float **)&d_A, nBytes);
    hipMalloc((float **)&d_B, nBytes);
    hipMalloc((float **)&d_C, nBytes);

    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);

    dim3 block(BDIM);
    dim3 grid((nElem + block.x - 1) / block.x);
    printf("> grid (%d,%d) block (%d,%d)\n", grid.x, grid.y, block.x, block.y);

    // warm-up pass: untimed copies and kernel launch
    hipMemcpy(d_A, h_A, nBytes, hipMemcpyHostToDevice);
    hipMemcpy(d_B, h_B, nBytes, hipMemcpyHostToDevice);
    hipLaunchKernelGGL(sumArrays, grid, block, 0, 0, d_A, d_B, d_C, nElem);
    hipMemcpy(h_C, d_C, nBytes, hipMemcpyDeviceToHost);

    // timed host-to-device copies (two buffers, 2 * nBytes in total)
    hipEventRecord(start, 0);
    hipMemcpy(d_A, h_A, nBytes, hipMemcpyHostToDevice);
    hipMemcpy(d_B, h_B, nBytes, hipMemcpyHostToDevice);
    hipEventRecord(stop, 0);
    hipEventSynchronize(stop);
    float memcpy_h2d_time;
    hipEventElapsedTime(&memcpy_h2d_time, start, stop);

    // timed kernel
    hipEventRecord(start, 0);
    hipLaunchKernelGGL(sumArrays, grid, block, 0, 0, d_A, d_B, d_C, nElem);
    hipEventRecord(stop, 0);
    hipEventSynchronize(stop);
    float kernel_time;
    hipEventElapsedTime(&kernel_time, start, stop);

    // timed device-to-host copy (one buffer, nBytes)
    hipEventRecord(start, 0);
    hipMemcpy(h_C, d_C, nBytes, hipMemcpyDeviceToHost);
    hipEventRecord(stop, 0);
    hipEventSynchronize(stop);
    float memcpy_d2h_time;
    hipEventElapsedTime(&memcpy_d2h_time, start, stop);

    // GB/s = bytes / (elapsed ms * 1e6)
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n",
           memcpy_h2d_time, (2 * nBytes * 1e-6) / memcpy_h2d_time);
    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n",
           memcpy_d2h_time, (nBytes * 1e-6) / memcpy_d2h_time);
    printf(" Kernel time: %f ms\n", kernel_time);
    float total_time = memcpy_h2d_time + memcpy_d2h_time + kernel_time;
    printf(" Total time: %f ms\n", total_time);

    // check device results
    // checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    hipFree(d_A);
    hipFree(d_B);
    hipFree(d_C);

    // free pinned host memory
    hipHostFree(h_A);
    hipHostFree(h_B);
    hipHostFree(h_C);

    hipEventDestroy(start);
    hipEventDestroy(stop);
    hipDeviceReset();
    return 0;
}
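The test above deliberately allocates its host buffers with hipHostMalloc, i.e. page-locked (pinned) memory that the DMA engines can read and write directly. For comparison, here is a minimal sketch that times the same host-to-device copy from an ordinary malloc'd (pageable) buffer; the buffer name and size are illustrative, not part of the test above. The pageable path is typically noticeably slower, because the runtime must first stage the data through an internal pinned buffer.

#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>

int main()
{
    const int nElem = 1 << 24;
    const size_t nBytes = nElem * sizeof(float);

    // pageable host buffer: plain malloc, no page locking
    float *h_pageable = (float *)malloc(nBytes);
    for (int i = 0; i < nElem; i++) h_pageable[i] = 1.0f;

    float *d_buf;
    hipMalloc(&d_buf, nBytes);

    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);

    // warm-up copy, then a timed copy
    hipMemcpy(d_buf, h_pageable, nBytes, hipMemcpyHostToDevice);
    hipEventRecord(start, 0);
    hipMemcpy(d_buf, h_pageable, nBytes, hipMemcpyHostToDevice);
    hipEventRecord(stop, 0);
    hipEventSynchronize(stop);

    float ms;
    hipEventElapsedTime(&ms, start, stop);
    printf("pageable H2D: %f ms (%f GB/s)\n", ms, (nBytes * 1e-6) / ms);

    free(h_pageable);
    hipFree(d_buf);
    hipEventDestroy(start);
    hipEventDestroy(stop);
    return 0;
}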
2. Intra-node DCU2DCU bandwidth test
#include "hip/hip_runtime.h"
#include "../common/common.h"
#include <stdlib.h>
#include <stdio.h>
#include <hip/hip_runtime.h>/** This example demonstrates P2P ping-ponging of data from one GPU to another,* within the same node. By enabling peer-to-peer transfers, you ensure that* copies between GPUs go directly over the PCIe bus. If P2P is not enabled,* host memory must be used as a staging area for GPU-to-GPU cudaMemcpys.*/__global__ void iKernel(float *src, float *dst)
{const int idx = blockIdx.x * blockDim.x + threadIdx.x;dst[idx] = src[idx] * 2.0f;
}inline bool isCapableP2P(int ngpus)
{hipDeviceProp_t prop[ngpus];int iCount = 0;for (int i = 0; i < ngpus; i++){CHECK(hipGetDeviceProperties(&prop[i], i));if (prop[i].major >= 2) iCount++;printf("> GPU%d: %s %s capable of Peer-to-Peer access\n", i,prop[i].name, (prop[i].major >= 2 ? "is" : "not"));}if(iCount != ngpus){printf("> no enough device to run this application\n");}return (iCount == ngpus);
}/** enable P2P memcopies between GPUs (all GPUs must be compute capability 2.0 or* later (Fermi or later)).*/
inline void enableP2P (int ngpus)
{for( int i = 0; i < ngpus; i++ ){CHECK(hipSetDevice(i));for(int j = 0; j < ngpus; j++){if(i == j) continue;int peer_access_available = 0;CHECK(hipDeviceCanAccessPeer(&peer_access_available, i, j));if (peer_access_available){CHECK(hipDeviceEnablePeerAccess(j, 0));printf("> GPU%d enabled direct access to GPU%d\n", i, j);}else{printf("(%d, %d)\n", i, j );}}}
}inline void disableP2P (int ngpus)
{for( int i = 0; i < ngpus; i++ ){CHECK(hipSetDevice(i));for(int j = 0; j < ngpus; j++){if( i == j ) continue;int peer_access_available = 0;CHECK(hipDeviceCanAccessPeer( &peer_access_available, i, j) );if( peer_access_available ){CHECK(hipDeviceDisablePeerAccess(j));printf("> GPU%d disabled direct access to GPU%d\n", i, j);}}}
}void initialData(float *ip, int size)
{for(int i = 0; i < size; i++){ip[i] = (float)rand() / (float)RAND_MAX;}
}int main(int argc, char **argv)
{int ngpus;// check device countCHECK(hipGetDeviceCount(&ngpus));printf("> CUDA-capable device count: %i\n", ngpus);// check p2p capabilityisCapableP2P(ngpus);// get ngpus from command lineif (argc > 1){if (atoi(argv[1]) > ngpus){fprintf(stderr, "Invalid number of GPUs specified: %d is greater ""than the total number of GPUs in this platform (%d)\n",atoi(argv[1]), ngpus);return 1;}ngpus = atoi(argv[1]);}if (ngpus < 2){fprintf(stderr, "No more than 2 GPUs supported\n");return 1;}if (ngpus > 1) enableP2P(ngpus);// Allocate buffersint iSize = 1<<24;const size_t iBytes = iSize * sizeof(float);printf("\nAllocating buffers (%iMB on each GPU and CPU Host)...\n",int(iBytes / 1024 / 1024));float **d_src = (float **)malloc(sizeof(float) * ngpus);float **d_rcv = (float **)malloc(sizeof(float) * ngpus);float **h_src = (float **)malloc(sizeof(float) * ngpus);hipStream_t *stream = (hipStream_t *)malloc(sizeof(hipStream_t) * ngpus);// Create CUDA event handleshipEvent_t start, stop;CHECK(hipSetDevice(0));CHECK(hipEventCreate(&start));CHECK(hipEventCreate(&stop));for (int i = 0; i < ngpus; i++){CHECK(hipSetDevice(i));CHECK(hipMalloc(&d_src[i], iBytes));CHECK(hipMalloc(&d_rcv[i], iBytes));CHECK(hipHostMalloc((void **) &h_src[i], iBytes));CHECK(hipStreamCreate(&stream[i]));}for (int i = 0; i < ngpus; i++){initialData(h_src[i], iSize);}// unidirectional gmem copyCHECK(hipSetDevice(0));CHECK(hipEventRecord(start, 0));for (int i = 0; i < 100; i++){if (i % 2 == 0){CHECK(hipMemcpy(d_src[1], d_src[0], iBytes,hipMemcpyDeviceToDevice));}else{CHECK(hipMemcpy(d_src[0], d_src[1], iBytes,hipMemcpyDeviceToDevice));}}CHECK(hipSetDevice(0));CHECK(hipEventRecord(stop, 0));CHECK(hipEventSynchronize(stop));float elapsed_time_ms;CHECK(hipEventElapsedTime(&elapsed_time_ms, start, stop ));elapsed_time_ms /= 100.0f;printf("Ping-pong unidirectional hipMemcpy:\t\t %8.2f ms ",elapsed_time_ms);printf("performance: %8.2f GB/s\n",(float)iBytes / (elapsed_time_ms * 1e6f));// bidirectional asynchronous gmem copyCHECK(hipEventRecord(start, 0));for (int i = 0; i < 100; i++){CHECK(hipMemcpyAsync(d_src[1], d_src[0], iBytes,hipMemcpyDeviceToDevice, stream[0]));CHECK(hipMemcpyAsync(d_rcv[0], d_rcv[1], iBytes,hipMemcpyDeviceToDevice, stream[1]));}CHECK(hipSetDevice(0));CHECK(hipEventRecord(stop, 0));CHECK(hipEventSynchronize(stop));elapsed_time_ms = 0.0f;CHECK(hipEventElapsedTime(&elapsed_time_ms, start, stop ));elapsed_time_ms /= 100.0f;printf("Ping-pong bidirectional hipMemcpyAsync:\t %8.2fms ",elapsed_time_ms);printf("performance: %8.2f GB/s\n",(float) 2.0f * iBytes / (elapsed_time_ms * 1e6f) );disableP2P(ngpus);// freeCHECK(hipSetDevice(0));CHECK(hipEventDestroy(start));CHECK(hipEventDestroy(stop));for (int i = 0; i < ngpus; i++){CHECK(hipSetDevice(i));CHECK(hipFree(d_src[i]));CHECK(hipFree(d_rcv[i]));CHECK(hipStreamDestroy(stream[i]));CHECK(hipDeviceReset());}exit(EXIT_SUCCESS);
}
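The ping-pong test uses hipMemcpy with hipMemcpyDeviceToDevice and lets unified addressing work out which device owns each pointer. HIP also provides hipMemcpyPeer, which names the source and destination devices explicitly; following the CUDA semantics that HIP mirrors, it should work even without peer access enabled, falling back to staging through host memory. Below is a minimal sketch of timing a unidirectional peer copy, assuming at least two devices are present; the buffer names are illustrative.

#include <stdio.h>
#include <hip/hip_runtime.h>

int main()
{
    const size_t nBytes = (size_t)(1 << 24) * sizeof(float);

    // one buffer on each of the first two devices
    float *d0 = NULL, *d1 = NULL;
    hipSetDevice(0);
    hipMalloc(&d0, nBytes);
    hipSetDevice(1);
    hipMalloc(&d1, nBytes);

    hipSetDevice(0);
    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);

    // hipMemcpyPeer(dst, dstDevice, src, srcDevice, bytes)
    hipEventRecord(start, 0);
    for (int i = 0; i < 100; i++)
    {
        hipMemcpyPeer(d1, 1, d0, 0, nBytes);
    }
    hipEventRecord(stop, 0);
    hipEventSynchronize(stop);

    float ms;
    hipEventElapsedTime(&ms, start, stop);
    ms /= 100.0f;
    printf("hipMemcpyPeer GPU0 -> GPU1: %f ms (%f GB/s)\n",
           ms, (nBytes * 1e-6) / ms);

    hipSetDevice(1); hipFree(d1);
    hipSetDevice(0); hipFree(d0);
    hipEventDestroy(start);
    hipEventDestroy(stop);
    return 0;
}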
That concludes this article on HIP test 2: CPU2DCU and DCU2DCU bandwidth tests. We hope it proves helpful to fellow programmers!