GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0 | 3.67 | 0.07 | 0 | 1706.67 | 42.67 | 7.40 | 0.00 | 0.00 | true |
void conv2d_c1_k1_nchw_hw_packed_kernel<float, float, 3>(cudnnTensorStruct, float const*, cudnnFilterStruct, float const*, cudnnConvolutionStruct, cudnnTensorStruct, float*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, int) | 2 | 23.33 | 0.44 | 7902720 | 13290.67 | 4522.67 | 13.23 | 443.64 | 338.68 | false |
void conv2d_grouped_direct_kernel<float, float, float, float, float, true, false, 0, 1, 3>(cudnnTensorStruct, float const*, cudnnFilterStruct, float const*, cudnnConvolutionStruct, cudnnTensorStruct, float*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, int, float const*, float const*, cudnnActivationStruct) | 12 | 68.67 | 1.30 | 10136880 | 118709.33 | 685109.33 | 50.03 | 12.61 | 147.63 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 52 | 272.33 | 5.17 | 22232592 | 148010.67 | 3730645.34 | 13.16 | 5.73 | 81.64 | true |
void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 32 | 545.00 | 10.35 | 321097688 | 7638325.33 | 2683616.00 | 3.84 | 31.11 | 589.17 | false |
void cudnn::detail::implicit_convolve_sgemm<float, float, 128, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 2 | 24.00 | 0.46 | 7598592 | 25610.67 | 26240.00 | 3.96 | 146.55 | 316.61 | false |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 8.00 | 0.15 | 83635 | 3840.00 | 5205.33 | 12.10 | 9.25 | 10.45 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 15 | 64.00 | 1.22 | 237568 | 640.00 | 0.00 | 6.20 | 371.20 | 3.71 | false |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::clip, float*, float*, float, float>(int, float*, float*, float, float) | 35 | 133.00 | 2.53 | 0 | 512.00 | 2627466.67 | 27.58 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 9 | 31.33 | 0.60 | 108192 | 0.00 | 25237.34 | 12.98 | 4.29 | 3.45 | true |
volta_scudnn_128x64_relu_interior_nn_v1 | 0 | 14.00 | 0.27 | 14450688 | 8192.00 | 765258.67 | 7.70 | 18.68 | 1032.19 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 15 | 192.00 | 3.65 | 494534656 | 0.00 | 938.67 | 15.50 | 526845.01 | 2575.70 | false |
Showing 1 to 12 of 12 entries