GPU Kernel Information Aggregated by Name

kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 125 | 254.00 | 0.12 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
maxwell_scudnn_128x128_relu_interior_nn | 7 | 329.50 | 0.16 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
maxwell_scudnn_128x32_relu_interior_nn | 10 | 539.33 | 0.26 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
maxwell_scudnn_128x32_relu_small_nn | 96 | 976.00 | 0.47 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
maxwell_scudnn_128x64_relu_interior_nn | 9 | 601.67 | 0.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148t_nt | 6127 | 61536.00 | 29.93 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 52 | 273.67 | 0.13 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::detail::explicit_convolve_sgemm<float, int, 1024, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 915 | 5707.12 | 2.78 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 1 | 86.00 | 0.04 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::detail::implicit_convolve_sgemm<float, float, 128, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0 | 217.00 | 0.11 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 12.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6127 | 24512.00 | 11.92 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 917 | 5048.00 | 2.46 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::clip, float*, float*, float, float>(int, float*, float*, float, float) | 35 | 152.50 | 0.07 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 9 | 21.50 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |

Showing 1 to 15 of 15 entries