GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 44 | 155.00 | 0.34 | 0 | 5866.67 | 580650.67 | 5.92 | 0.00 | 0.00 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 120 | 6214.00 | 13.70 | 3149537280 | 1937702154.67 | 2023561184.00 | 73.05 | 0.80 | 506.85 | true |
void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 15 | 1224.33 | 2.70 | 4832952320 | 8117653.33 | 19475125.33 | 6.98 | 175.15 | 3947.41 | false |
void cudnn::detail::implicit_convolve_sgemm<float, float, 128, 6, 7, 3, 3, 5, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0 | 725.00 | 1.60 | 6579879936 | 832.00 | 3744.00 | 31.90 | 1437910.83 | 9075.70 | false |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 3 | 196.33 | 0.43 | 120310240 | 89293536.00 | 29608597.33 | 50.46 | 1.01 | 612.79 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 216.33 | 0.48 | 6422528 | 102768384.00 | 32166229.33 | 72.00 | 0.05 | 29.69 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 45 | 214.67 | 0.47 | 10928128 | 6794453.33 | 6625728.00 | 6.20 | 0.81 | 50.91 | true |
void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 11 | 766.00 | 1.69 | 578027520 | 152517248.00 | 350608853.33 | 42.88 | 1.15 | 754.61 | true |
void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 11 | 62.00 | 0.14 | 3981312 | 1777749.33 | 4575221.33 | 12.32 | 0.63 | 64.21 | true |
void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 11 | 219.00 | 0.48 | 198795264 | 68345685.33 | 40892938.67 | 36.31 | 1.82 | 907.74 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 4.33 | 0.01 | 32000 | 6442.67 | 4309.33 | 19.00 | 2.98 | 7.39 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 113 | 3544.66 | 7.81 | 0 | 1134910378.67 | 1140702688.00 | 77.20 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanLargeKernel<mshadow::sv::saveto, 8, 1024, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>, int) | 1 | 505.00 | 1.11 | 0 | 166998720.00 | 162589770.67 | 82.19 | 0.00 | 0.00 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 120 | 5389.00 | 11.88 | 1002717184 | 1851847402.67 | 1939018549.34 | 93.98 | 0.26 | 186.07 | true |
volta_scudnn_128x64_relu_interior_nn_v1 | 43 | 8573.00 | 18.90 | 83133202432 | 1859480864.00 | 571192309.33 | 17.19 | 34.20 | 9697.10 | false |
volta_scudnn_128x64_relu_medium_nn_v1 | 0 | 670.67 | 1.48 | 7861174272 | 21.33 | 5205.33 | 24.30 | 1504051.39 | 11721.43 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 45 | 3685.00 | 8.12 | 32674152448 | 867243370.67 | 121673045.33 | 19.76 | 33.04 | 8866.79 | false |
volta_sgemm_128x64_nn | 11 | 1064.33 | 2.35 | 11823611904 | 351070122.67 | 91420032.00 | 20.98 | 26.72 | 11108.94 | false |
volta_sgemm_32x32_sliced1x4_tn | 0 | 28.67 | 0.06 | 67633152 | 4111626.67 | 929109.33 | 6.20 | 13.42 | 2359.27 | true |
Showing 1 to 19 of 19 entries