GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4 | 16.67 | 0.19 | 0 | 1792.00 | 29013.33 | 6.06 | 0.00 | 0.00 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 50 | 318.67 | 3.68 | 62333952 | 2314730.67 | 27837536.00 | 26.31 | 2.07 | 195.61 | true |
void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 34 | 2124.00 | 24.52 | 5108182528 | 64023082.67 | 22062677.33 | 5.30 | 59.34 | 2404.98 | false |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 8.67 | 0.10 | 144028 | 3754.67 | 0.00 | 11.80 | 38.36 | 16.62 | false |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 9.33 | 0.11 | 200704 | 5333.33 | 1144288.00 | 53.90 | 0.17 | 21.50 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12 | 152.00 | 1.75 | 52977664 | 33162325.33 | 47198773.33 | 39.06 | 0.66 | 348.54 | true |
void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 4, 4, false, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>, float, float) | 0 | 18.00 | 0.21 | 4431000 | 8196416.00 | 964053.33 | 9.80 | 0.48 | 246.17 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 4.00 | 0.05 | 1000 | 5760.00 | 0.00 | 12.20 | 0.17 | 0.25 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 0 | 3.00 | 0.03 | 0 | 224.00 | 0.00 | 11.90 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 15 | 95.33 | 1.10 | 5519360 | 17309685.33 | 7716874.67 | 57.20 | 0.22 | 57.90 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 49 | 231.00 | 2.67 | 19618816 | 161920.00 | 7398954.67 | 51.31 | 2.59 | 84.93 | true |
volta_scudnn_128x64_relu_interior_nn_v1 | 3 | 97.00 | 1.12 | 425984000 | 272000.00 | 10454976.00 | 8.07 | 39.71 | 4391.59 | false |
volta_scudnn_128x64_relu_small_nn_v1 | 0 | 143.67 | 1.66 | 264470528 | 600352.00 | 172096.00 | 6.20 | 342.38 | 1840.86 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 12 | 735.00 | 8.49 | 2055782400 | 33997973.33 | 14390058.67 | 12.50 | 42.49 | 2796.99 | false |
Showing 1 to 14 of 14 entries