HIP技术测试2-CPU2DCU和DCU2DCU带宽测试

2023-11-06 08:50

本文主要介绍 HIP 技术测试 2——CPU2DCU 和 DCU2DCU 带宽测试,希望能为大家解决编程问题提供一定的参考价值,需要的开发者们跟随小编一起来学习吧!

HIP技术测试2-CPU2DCU和DCU2DCU带宽测试

1.节点内CPU2DCU带宽测试

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <hip/hip_runtime.h>

#define NSTREAM 2
#define BDIM 512

/* Abort with a diagnostic when a HIP runtime call does not return hipSuccess. */
#define HIP_CHECK(call)                                                    \
    do {                                                                   \
        const hipError_t hip_status_ = (call);                             \
        if (hip_status_ != hipSuccess) {                                   \
            fprintf(stderr, "HIP error at %s:%d: %s\n", __FILE__, __LINE__, \
                    hipGetErrorString(hip_status_));                       \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Fill ip[0..size) with pseudo-random values in [0.0, 25.5]
 * (the low byte of rand() divided by 10).
 */
void initialData(float *ip, int size)
{
    for (int i = 0; i < size; i++) {
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}

/* Host reference: C[i] = A[i] + B[i] for i in [0, N). */
void sumArraysOnHost(float *A, float *B, float *C, const int N)
{
    for (int idx = 0; idx < N; idx++) {
        C[idx] = A[idx] + B[idx];
    }
}

/*
 * Device kernel: one thread per element computes C[idx] = A[idx] + B[idx].
 * The same store is repeated 60 times; presumably this is only meant to
 * lengthen the kernel for timing purposes -- the stored value does not change.
 */
__global__ void sumArrays(float *A, float *B, float *C, const int N)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < N) {
        for (int j = 0; j < 60; j++) {
            C[idx] = A[idx] + B[idx];
        }
    }
}

/*
 * Compare two host arrays element by element and report the first mismatch.
 * fabs() is used explicitly so the float difference is never routed to the
 * integer abs() overload.
 */
void checkResult(float *hostRef, float *gpuRef, const int N)
{
    double epsilon = 1.0E-8;
    bool match = 1;

    for (int i = 0; i < N; i++) {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon) {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }

    if (match) printf("Arrays match.\n\n");
}

/*
 * Measures host-to-device and device-to-host hipMemcpy throughput, plus the
 * kernel time, for two input vectors and one output vector of 2^24 floats.
 * Host buffers come from hipHostMalloc. One untimed warm-up pass runs first.
 */
int main(int argc, char **argv)
{
    (void)argc;
    printf("> %s Starting...\n", argv[0]);

    int dev = 0;
    HIP_CHECK(hipSetDevice(dev));
    hipDeviceProp_t deviceProp;
    HIP_CHECK(hipGetDeviceProperties(&deviceProp, dev));
    printf("> Using Device %d: %s\n", dev, deviceProp.name);

    // set up data size of vectors
    int nElem = 1 << 24;
    printf("> vector size = %d\n", nElem);
    size_t nBytes = nElem * sizeof(float);
    printf("> size nBytes = %zu MB\n", nBytes / 1024 / 1024);

    // host buffers
    float *h_A, *h_B, *h_C;
    HIP_CHECK(hipHostMalloc((void **)&h_A, nBytes, hipHostMallocDefault));
    HIP_CHECK(hipHostMalloc((void **)&h_B, nBytes, hipHostMallocDefault));
    HIP_CHECK(hipHostMalloc((void **)&h_C, nBytes, hipHostMallocDefault));

    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(h_C, 0, nBytes);

    // device buffers
    float *d_A, *d_B, *d_C;
    HIP_CHECK(hipMalloc((void **)&d_A, nBytes));
    HIP_CHECK(hipMalloc((void **)&d_B, nBytes));
    HIP_CHECK(hipMalloc((void **)&d_C, nBytes));

    hipEvent_t start, stop;
    HIP_CHECK(hipEventCreate(&start));
    HIP_CHECK(hipEventCreate(&stop));

    dim3 block(BDIM);
    dim3 grid((nElem + block.x - 1) / block.x);
    printf("> grid (%u,%u) block (%u,%u)\n", grid.x, grid.y, block.x, block.y);

    // untimed warm-up pass: copy in, run the kernel, copy out
    HIP_CHECK(hipMemcpy(d_A, h_A, nBytes, hipMemcpyHostToDevice));
    HIP_CHECK(hipMemcpy(d_B, h_B, nBytes, hipMemcpyHostToDevice));
    hipLaunchKernelGGL(sumArrays, dim3(grid), dim3(block), 0, 0, d_A, d_B, d_C, nElem);
    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipMemcpy(h_C, d_C, nBytes, hipMemcpyDeviceToHost));

    // timed host -> device copies (two buffers, hence 2 * nBytes below)
    HIP_CHECK(hipEventRecord(start, 0));
    HIP_CHECK(hipMemcpy(d_A, h_A, nBytes, hipMemcpyHostToDevice));
    HIP_CHECK(hipMemcpy(d_B, h_B, nBytes, hipMemcpyHostToDevice));
    HIP_CHECK(hipEventRecord(stop, 0));
    HIP_CHECK(hipEventSynchronize(stop));
    float memcpy_h2d_time;
    HIP_CHECK(hipEventElapsedTime(&memcpy_h2d_time, start, stop));

    // timed kernel
    HIP_CHECK(hipEventRecord(start, 0));
    hipLaunchKernelGGL(sumArrays, dim3(grid), dim3(block), 0, 0, d_A, d_B, d_C, nElem);
    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipEventRecord(stop, 0));
    HIP_CHECK(hipEventSynchronize(stop));
    float kernel_time;
    HIP_CHECK(hipEventElapsedTime(&kernel_time, start, stop));

    // timed device -> host copy
    HIP_CHECK(hipEventRecord(start, 0));
    HIP_CHECK(hipMemcpy(h_C, d_C, nBytes, hipMemcpyDeviceToHost));
    HIP_CHECK(hipEventRecord(stop, 0));
    HIP_CHECK(hipEventSynchronize(stop));
    float memcpy_d2h_time;
    HIP_CHECK(hipEventElapsedTime(&memcpy_d2h_time, start, stop));

    // bytes * 1e-6 / milliseconds == GB/s
    printf("Measured timings (throughput):\n");
    printf(" Memcpy host to device\t: %f ms (%f GB/s)\n", memcpy_h2d_time,
           (2 * nBytes * 1e-6) / memcpy_h2d_time);
    printf(" Memcpy device to host\t: %f ms (%f GB/s)\n", memcpy_d2h_time,
           (nBytes * 1e-6) / memcpy_d2h_time);
    printf(" Kernel time: %f ms\n", kernel_time);
    float total_time = memcpy_h2d_time + memcpy_d2h_time + kernel_time;
    printf(" Total time: %f ms\n", total_time);

    // Result verification is not performed here. To enable it, fill a host
    // reference buffer with sumArraysOnHost() and pass it with h_C to checkResult().

    // free device global memory
    HIP_CHECK(hipFree(d_A));
    HIP_CHECK(hipFree(d_B));
    HIP_CHECK(hipFree(d_C));

    // free host memory
    HIP_CHECK(hipHostFree(h_A));
    HIP_CHECK(hipHostFree(h_B));
    HIP_CHECK(hipHostFree(h_C));

    HIP_CHECK(hipEventDestroy(start));
    HIP_CHECK(hipEventDestroy(stop));

    HIP_CHECK(hipDeviceReset());
    return 0;
}

2.节点内DCU2DCU带宽测试

#include "hip/hip_runtime.h"
#include "../common/common.h"
#include <stdlib.h>
#include <stdio.h>
#include <hip/hip_runtime.h>

/*
 * This example demonstrates P2P ping-ponging of data from one GPU to another,
 * within the same node. By enabling peer-to-peer transfers, you ensure that
 * copies between GPUs go directly over the PCIe bus. If P2P is not enabled,
 * host memory must be used as a staging area for GPU-to-GPU memcpys.
 */

/*
 * dst[idx] = 2 * src[idx], one element per thread. There is no bounds guard,
 * so the launch must cover exactly the number of elements in both buffers.
 */
__global__ void iKernel(float *src, float *dst)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    dst[idx] = src[idx] * 2.0f;
}

/*
 * Print, for each of the first ngpus devices, whether its reported major
 * compute capability is >= 2, and return true only when all of them qualify.
 * A single properties struct is reused per device instead of a
 * variable-length array, which is not standard C++.
 */
inline bool isCapableP2P(int ngpus)
{
    int iCount = 0;

    for (int i = 0; i < ngpus; i++)
    {
        hipDeviceProp_t prop;
        CHECK(hipGetDeviceProperties(&prop, i));

        const bool capable = (prop.major >= 2);
        if (capable) iCount++;

        printf("> GPU%d: %s %s capable of Peer-to-Peer access\n", i,
               prop.name, (capable ? "is" : "not"));
    }

    if (iCount != ngpus)
    {
        printf("> no enough device to run this application\n");
    }

    return (iCount == ngpus);
}

/*
 * Enable peer access from every device i to every other device j for which
 * hipDeviceCanAccessPeer reports support. Pairs without support are printed
 * as "(i, j)".
 */
inline void enableP2P (int ngpus)
{
    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));

        for (int j = 0; j < ngpus; j++)
        {
            if (i == j) continue;

            int peer_access_available = 0;
            CHECK(hipDeviceCanAccessPeer(&peer_access_available, i, j));

            if (peer_access_available)
            {
                CHECK(hipDeviceEnablePeerAccess(j, 0));
                printf("> GPU%d enabled direct access to GPU%d\n", i, j);
            }
            else
            {
                printf("(%d, %d)\n", i, j );
            }
        }
    }
}

/* Disable peer access for every device pair that reports peer capability. */
inline void disableP2P (int ngpus)
{
    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));

        for (int j = 0; j < ngpus; j++)
        {
            if (i == j) continue;

            int peer_access_available = 0;
            CHECK(hipDeviceCanAccessPeer(&peer_access_available, i, j));

            if (peer_access_available)
            {
                CHECK(hipDeviceDisablePeerAccess(j));
                printf("> GPU%d disabled direct access to GPU%d\n", i, j);
            }
        }
    }
}

/* Fill ip[0..size) with pseudo-random values in [0, 1]. */
void initialData(float *ip, int size)
{
    for (int i = 0; i < size; i++)
    {
        ip[i] = (float)rand() / (float)RAND_MAX;
    }
}

/*
 * Times 100 device-to-device copies between device 0 and device 1, first
 * alternating direction with blocking hipMemcpy, then in both directions at
 * once with hipMemcpyAsync on two streams. An optional first command-line
 * argument limits the number of devices used.
 */
int main(int argc, char **argv)
{
    int ngpus;

    // check device count
    CHECK(hipGetDeviceCount(&ngpus));
    printf("> CUDA-capable device count: %i\n", ngpus);

    // check p2p capability (informational only; the result is not acted upon)
    isCapableP2P(ngpus);

    // get ngpus from command line
    if (argc > 1)
    {
        const int requested = atoi(argv[1]);

        if (requested > ngpus)
        {
            fprintf(stderr, "Invalid number of GPUs specified: %d is greater "
                    "than the total number of GPUs in this platform (%d)\n",
                    requested, ngpus);
            return 1;
        }

        ngpus = requested;
    }

    // The benchmark copies between device 0 and device 1, so two are needed.
    if (ngpus < 2)
    {
        fprintf(stderr, "At least 2 GPUs are required\n");
        return 1;
    }

    enableP2P(ngpus);

    // Allocate buffers
    int iSize = 1 << 24;
    const size_t iBytes = iSize * sizeof(float);
    printf("\nAllocating buffers (%iMB on each GPU and CPU Host)...\n",
           int(iBytes / 1024 / 1024));

    // These tables hold pointers, so they are sized with sizeof(float *).
    float **d_src = (float **)malloc(sizeof(float *) * ngpus);
    float **d_rcv = (float **)malloc(sizeof(float *) * ngpus);
    float **h_src = (float **)malloc(sizeof(float *) * ngpus);
    hipStream_t *stream = (hipStream_t *)malloc(sizeof(hipStream_t) * ngpus);

    if (d_src == NULL || d_rcv == NULL || h_src == NULL || stream == NULL)
    {
        fprintf(stderr, "Host allocation of pointer tables failed\n");
        return 1;
    }

    // Create event handles on device 0
    hipEvent_t start, stop;
    CHECK(hipSetDevice(0));
    CHECK(hipEventCreate(&start));
    CHECK(hipEventCreate(&stop));

    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));
        CHECK(hipMalloc(&d_src[i], iBytes));
        CHECK(hipMalloc(&d_rcv[i], iBytes));
        CHECK(hipHostMalloc((void **) &h_src[i], iBytes));
        CHECK(hipStreamCreate(&stream[i]));
    }

    for (int i = 0; i < ngpus; i++)
    {
        initialData(h_src[i], iSize);
    }

    // unidirectional gmem copy
    CHECK(hipSetDevice(0));
    CHECK(hipEventRecord(start, 0));

    for (int i = 0; i < 100; i++)
    {
        if (i % 2 == 0)
        {
            CHECK(hipMemcpy(d_src[1], d_src[0], iBytes,
                            hipMemcpyDeviceToDevice));
        }
        else
        {
            CHECK(hipMemcpy(d_src[0], d_src[1], iBytes,
                            hipMemcpyDeviceToDevice));
        }
    }

    CHECK(hipSetDevice(0));
    CHECK(hipEventRecord(stop, 0));
    CHECK(hipEventSynchronize(stop));

    float elapsed_time_ms;
    CHECK(hipEventElapsedTime(&elapsed_time_ms, start, stop ));

    elapsed_time_ms /= 100.0f;
    printf("Ping-pong unidirectional hipMemcpy:\t\t %8.2f ms ",
           elapsed_time_ms);
    printf("performance: %8.2f GB/s\n",
           (float)iBytes / (elapsed_time_ms * 1e6f));

    // bidirectional asynchronous gmem copy
    CHECK(hipEventRecord(start, 0));

    for (int i = 0; i < 100; i++)
    {
        CHECK(hipMemcpyAsync(d_src[1], d_src[0], iBytes,
                             hipMemcpyDeviceToDevice, stream[0]));
        CHECK(hipMemcpyAsync(d_rcv[0], d_rcv[1], iBytes,
                             hipMemcpyDeviceToDevice, stream[1]));
    }

    // stream[1] was created on device 1, so an event recorded on device 0
    // does not wait for it. Drain both streams first so the measured
    // interval covers all queued copies.
    CHECK(hipStreamSynchronize(stream[0]));
    CHECK(hipStreamSynchronize(stream[1]));

    CHECK(hipSetDevice(0));
    CHECK(hipEventRecord(stop, 0));
    CHECK(hipEventSynchronize(stop));

    elapsed_time_ms = 0.0f;
    CHECK(hipEventElapsedTime(&elapsed_time_ms, start, stop ));

    elapsed_time_ms /= 100.0f;
    printf("Ping-pong bidirectional hipMemcpyAsync:\t %8.2fms ",
           elapsed_time_ms);
    printf("performance: %8.2f GB/s\n",
           (float) 2.0f * iBytes / (elapsed_time_ms * 1e6f) );

    disableP2P(ngpus);

    // free
    CHECK(hipSetDevice(0));
    CHECK(hipEventDestroy(start));
    CHECK(hipEventDestroy(stop));

    for (int i = 0; i < ngpus; i++)
    {
        CHECK(hipSetDevice(i));
        CHECK(hipFree(d_src[i]));
        CHECK(hipFree(d_rcv[i]));
        CHECK(hipHostFree(h_src[i]));
        CHECK(hipStreamDestroy(stream[i]));
        CHECK(hipDeviceReset());
    }

    free(d_src);
    free(d_rcv);
    free(h_src);
    free(stream);

    return EXIT_SUCCESS;
}

在这里插入图片描述

这篇关于 HIP 技术测试 2——CPU2DCU 和 DCU2DCU 带宽测试的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!



http://www.chinasem.cn/article/355627

相关文章

使用Python进行GRPC和Dubbo协议的高级测试

《使用Python进行GRPC和Dubbo协议的高级测试》GRPC(GoogleRemoteProcedureCall)是一种高性能、开源的远程过程调用(RPC)框架,Dubbo是一种高性能的分布式服... 目录01 GRPC测试安装gRPC编写.proto文件实现服务02 Dubbo测试1. 安装Dubb

Qt如何实现文本编辑器光标高亮技术

《Qt如何实现文本编辑器光标高亮技术》这篇文章主要为大家详细介绍了Qt如何实现文本编辑器光标高亮技术,文中的示例代码讲解详细,具有一定的借鉴价值,有需要的小伙伴可以了解下... 目录实现代码函数作用概述代码详解 + 注释使用 QTextEdit 的高亮技术(重点)总结用到的关键技术点应用场景举例示例优化建议

Python的端到端测试框架SeleniumBase使用解读

《Python的端到端测试框架SeleniumBase使用解读》:本文主要介绍Python的端到端测试框架SeleniumBase使用,具有很好的参考价值,希望对大家有所帮助,如有错误或未考虑完全... 目录SeleniumBase详细介绍及用法指南什么是 SeleniumBase?SeleniumBase

Java中的登录技术保姆级详细教程

《Java中的登录技术保姆级详细教程》:本文主要介绍Java中登录技术保姆级详细教程的相关资料,在Java中我们可以使用各种技术和框架来实现这些功能,文中通过代码介绍的非常详细,需要的朋友可以参考... 目录1.登录思路2.登录标记1.会话技术2.会话跟踪1.Cookie技术2.Session技术3.令牌技

python多线程并发测试过程

《python多线程并发测试过程》:本文主要介绍python多线程并发测试过程,具有很好的参考价值,希望对大家有所帮助,如有错误或未考虑完全的地方,望不吝赐教... 目录一、并发与并行?二、同步与异步的概念?三、线程与进程的区别?需求1:多线程执行不同任务需求2:多线程执行相同任务总结一、并发与并行?1、

Web技术与Nginx网站环境部署教程

《Web技术与Nginx网站环境部署教程》:本文主要介绍Web技术与Nginx网站环境部署教程,具有很好的参考价值,希望对大家有所帮助,如有错误或未考虑完全的地方,望不吝赐教... 目录一、Web基础1.域名系统DNS2.Hosts文件3.DNS4.域名注册二.网页与html1.网页概述2.HTML概述3.

Java使用WebView实现桌面程序的技术指南

《Java使用WebView实现桌面程序的技术指南》在现代软件开发中,许多应用需要在桌面程序中嵌入Web页面,例如,你可能需要在Java桌面应用中嵌入一部分Web前端,或者加载一个HTML5界面以增强... 目录1、简述2、WebView 特点3、搭建 WebView 示例3.1 添加 JavaFX 依赖3

SpringBoot3实现Gzip压缩优化的技术指南

《SpringBoot3实现Gzip压缩优化的技术指南》随着Web应用的用户量和数据量增加,网络带宽和页面加载速度逐渐成为瓶颈,为了减少数据传输量,提高用户体验,我们可以使用Gzip压缩HTTP响应,... 目录1、简述2、配置2.1 添加依赖2.2 配置 Gzip 压缩3、服务端应用4、前端应用4.1 N

Java利用JSONPath操作JSON数据的技术指南

《Java利用JSONPath操作JSON数据的技术指南》JSONPath是一种强大的工具,用于查询和操作JSON数据,类似于SQL的语法,它为处理复杂的JSON数据结构提供了简单且高效... 目录1、简述2、什么是 jsONPath?3、Java 示例3.1 基本查询3.2 过滤查询3.3 递归搜索3.4

Python中随机休眠技术原理与应用详解

《Python中随机休眠技术原理与应用详解》在编程中,让程序暂停执行特定时间是常见需求,当需要引入不确定性时,随机休眠就成为关键技巧,下面我们就来看看Python中随机休眠技术的具体实现与应用吧... 目录引言一、实现原理与基础方法1.1 核心函数解析1.2 基础实现模板1.3 整数版实现二、典型应用场景2