本文主要是介绍测试cuda trap指令在cuda-gdb下的行为,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
测试cuda trap指令在cuda-gdb下的行为
- 1.测试小结
- 2.测试步骤
本文测试cuda trap指令在cuda-gdb下的行为
1.测试小结
- cuda-gdb遇到trap指令后,当前的warp会停住
- 运行continue后,可以继续运行下一条指令
- 仅当前的warp会被停住,其它warp正常执行(通过cuda-gdb显示的代码行号以及kernel里加的时间戳可以判断)
2.测试步骤
tee trap_inst_benchmark.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>

/**
 * Probe kernel used to observe how `__trap()` behaves under cuda-gdb.
 *
 * Each thread records the SM it runs on and a clock64() timestamp, writes its
 * own global index into output_data, and finally prints the number of %clock
 * ticks that elapsed across the (possibly trapping) region. Only the thread
 * whose global index is 15 executes `__trap()`.
 *
 * Launch layout: 1-D grid of 1-D blocks. There is no bounds check, so every
 * buffer must hold at least gridDim.x * blockDim.x elements.
 * Requirements: the `discard.global.L2` instruction needs sm_80 or newer.
 *
 * @param output_data  per-thread float slot; receives the thread's global index
 * @param output_ts    per-thread slot; receives the clock64() value sampled at entry
 * @param output_smid  per-thread slot; receives the %smid special register value
 */
__global__ void kernel(float *output_data, unsigned long long *output_ts, unsigned int *output_smid) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int smid;
    long long ts = clock64();

    // '%' must be doubled inside an inline-asm template to name a PTX special
    // register, exactly as done for %%clock below.
    asm volatile("mov.u32 %0, %%smid;" : "=r"(smid));
    output_smid[tid] = smid;
    output_ts[tid] = ts;

    float val = static_cast<float>(tid);
    // The "memory" clobbers tell the compiler these statements touch memory, so
    // the later C++ read of output_data[tid] is not moved ahead of the store.
    asm volatile("st.global.wt.f32 [%0],%1;" :: "l"(&output_data[tid]), "f"(val) : "memory");
    // NOTE(review): the PTX ISA describes discard.global.L2 as operating on a
    // 128-byte aligned address and leaving the discarded data indeterminate;
    // per-element addresses are passed here - confirm this is intended.
    asm volatile("discard.global.L2 [%0],128;" :: "l"(&output_data[tid]) : "memory");
    asm volatile("discard.global.L2 [%0],128;" :: "l"(&output_ts[tid]) : "memory");

    unsigned int ts0;
    unsigned int ts1;
    asm volatile("mov.u32 %0, %%clock;" : "=r"(ts0) :: "memory");
    if (tid == 15) {
        printf("trap in tid:%d smid:%u ts:%lld\n", tid, smid, clock64());
        __trap();
        // Alternatives that were tried in place of __trap():
        //assert(0);
        //__brkpt();
        printf("trap out tid:%d smid:%u ts:%lld\n", tid, smid, clock64());
    }
    asm volatile("mov.u32 %0, %%clock;" : "=r"(ts1) :: "memory");

    // ts1 - ts0 is a 32-bit unsigned tick delta, hence %u.
    printf("kernel smid:%u tid:%04d val:%6.2f ts:%u\n", smid, tid, output_data[tid], ts1 - ts0);
}

/**
 * Evaluates a CUDA runtime call once and reports a failure on stderr.
 *
 * Deliberately does NOT abort: the experiment wants to show that every API
 * call issued after a trapping kernel keeps failing.
 * The message describes the error code returned by the call itself.
 */
#define CHECK_CUDA(call)                                                           \
    do {                                                                           \
        cudaError_t check_cuda_err_ = (call);                                      \
        if (check_cuda_err_ != cudaSuccess) {                                      \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__;          \
            std::cerr << " code=" << check_cuda_err_ << " ("                       \
                      << cudaGetErrorString(check_cuda_err_) << ")" << std::endl;  \
        }                                                                          \
    } while (0)

/**
 * Host driver: allocates the three per-thread buffers, launches one block of
 * 128 threads (4 warps) of `kernel`, then attempts to copy the results back.
 *
 * @return always 0; CUDA failures are only reported on stderr.
 */
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    const int deviceid = 0;
    CHECK_CUDA(cudaSetDevice(deviceid));

    const int block_size = 1;
    const int thread_block_size = 32 * 4;
    const int thread_size = thread_block_size * block_size;
    const size_t data_size = sizeof(float) * thread_size;
    const size_t ts_size = sizeof(unsigned long long) * thread_size;
    const size_t smid_size = sizeof(unsigned int) * thread_size;

    float *dev_output_data = nullptr;
    unsigned long long *dev_output_ts = nullptr;
    unsigned int *dev_smid = nullptr;

    // Value-initialised host buffers: released automatically, and the initial
    // host-to-device copies no longer read indeterminate memory.
    std::vector<float> host_output_data(thread_size);
    std::vector<unsigned long long> host_output_ts(thread_size);
    std::vector<unsigned int> host_smid(thread_size);

    CHECK_CUDA(cudaMalloc(&dev_output_data, data_size));
    CHECK_CUDA(cudaMalloc(&dev_output_ts, ts_size));
    CHECK_CUDA(cudaMalloc(&dev_smid, smid_size));

    CHECK_CUDA(cudaMemcpy(dev_output_data, host_output_data.data(), data_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_output_ts, host_output_ts.data(), ts_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(dev_smid, host_smid.data(), smid_size, cudaMemcpyHostToDevice));

    printf("dev_output_data:%p\n", static_cast<void *>(dev_output_data));
    printf("dev_output_ts:%p\n", static_cast<void *>(dev_output_ts));
    printf("dev_smid:%p\n", static_cast<void *>(dev_smid));

    kernel<<<block_size, thread_block_size>>>(dev_output_data, dev_output_ts, dev_smid);
    // Launch-configuration errors are only visible through cudaGetLastError().
    CHECK_CUDA(cudaGetLastError());

    // If the kernel raises an exception, every subsequent CUDA API call fails,
    // so the data in device memory can no longer be retrieved.
    CHECK_CUDA(cudaDeviceSynchronize());
    CHECK_CUDA(cudaMemcpy(host_output_data.data(), dev_output_data, data_size, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(host_output_ts.data(), dev_output_ts, ts_size, cudaMemcpyDeviceToHost));
    CHECK_CUDA(cudaMemcpy(host_smid.data(), dev_smid, smid_size, cudaMemcpyDeviceToHost));

    for (int i = 0; i < thread_size; i++) {
        // Uncomment to dump the values copied back from the device:
        // printf("tid:%04d smid:%08u val:%6.2f ts:%llu\n",i,host_smid[i],host_output_data[i],host_output_ts[i]);
    }

    CHECK_CUDA(cudaFree(dev_output_data));
    CHECK_CUDA(cudaFree(dev_output_ts));
    CHECK_CUDA(cudaFree(dev_smid));
    return 0;
}
EOF
# The heredoc terminator above must sit alone on its line; otherwise the shell
# keeps reading the commands below into trap_inst_benchmark.cu.

# Build for sm_86 with host debug info (-g) and device line info (-lineinfo).
/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_86 -g -lineinfo -o trap_inst_benchmark trap_inst_benchmark.cu -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
# Run the benchmark under cuda-gdb to observe the trap.
/usr/local/cuda/bin/cuda-gdb ./trap_inst_benchmark
输出
kernel smid:0 tid:0116 val:116.00 ts:53
kernel smid:0 tid:0117 val:117.00 ts:53
kernel smid:0 tid:0118 val:118.00 ts:53
kernel smid:0 tid:0119 val:119.00 ts:53
kernel smid:0 tid:0120 val:120.00 ts:53
kernel smid:0 tid:0121 val:121.00 ts:53
kernel smid:0 tid:0122 val:122.00 ts:53
kernel smid:0 tid:0123 val:123.00 ts:53
kernel smid:0 tid:0124 val:124.00 ts:53
kernel smid:0 tid:0125 val:125.00 ts:53
kernel smid:0 tid:0126 val:126.00 ts:53
kernel smid:0 tid:0127 val:127.00 ts:53
trap out tid:15 smid:0 ts:3320272720798
kernel smid:0 tid:0000 val: 0.00 ts:28106907
kernel smid:0 tid:0001 val: 1.00 ts:28106907
kernel smid:0 tid:0002 val: 2.00 ts:28106907
kernel smid:0 tid:0003 val: 3.00 ts:28106907
kernel smid:0 tid:0004 val: 4.00 ts:28106907
kernel smid:0 tid:0005 val: 5.00 ts:28106907
kernel smid:0 tid:0006 val: 6.00 ts:28106907
kernel smid:0 tid:0007 val: 7.00 ts:28106907
kernel smid:0 tid:0008 val: 8.00 ts:28106907
这篇关于测试cuda trap指令在cuda-gdb下的行为的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!