GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 107 | 360.33 | 0.07 | 0 | 18026.67 | 626837.33 | 5.80 | 0.00 | 0.00 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 152 | 60124.66 | 11.26 | 34485829632 | 9624158144.00 | 12402388000.00 | 83.97 | 1.57 | 573.57 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 238.67 | 0.04 | 37040128 | 177505333.33 | 5618496.00 | 60.60 | 0.20 | 155.20 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 1663.00 | 0.31 | 51380224 | 179586197.33 | 140975904.00 | 72.30 | 0.16 | 30.90 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 2 | 16.67 | 0.00 | 712704 | 449344.00 | 812714.67 | 6.20 | 0.56 | 42.76 | true |
void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 36 | 9927.33 | 1.86 | 9059696640 | 1849671989.34 | 5421199040.00 | 46.96 | 1.25 | 912.60 | true |
void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 36 | 710.34 | 0.13 | 228261888 | 101468416.00 | 401535690.67 | 39.24 | 0.45 | 321.34 | true |
void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 36 | 9452.33 | 1.77 | 10299113472 | 5448045610.66 | 1955122090.66 | 48.20 | 1.39 | 1089.58 | true |
void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 6 | 2330.33 | 0.44 | 4932501504 | 998925269.33 | 774036448.00 | 47.46 | 2.78 | 2116.65 | true |
void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 6 | 2512.00 | 0.47 | 5178392576 | 724868832.00 | 998594677.33 | 47.89 | 3.00 | 2061.46 | true |
void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 6 | 756.00 | 0.14 | 2589196288 | 4192906.67 | 495577749.33 | 46.57 | 5.18 | 3424.86 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 5.67 | 0.00 | 256000 | 6016.00 | 1024.00 | 59.90 | 36.36 | 45.17 | false |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 0 | 3.67 | 0.00 | 0 | 1504.00 | 5632.00 | 12.10 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 49 | 48541.00 | 9.09 | 3365404672 | 2941065098.67 | 3604765034.66 | 96.17 | 0.51 | 69.33 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 151 | 57978.33 | 10.86 | 10879762432 | 9958465162.67 | 12238789845.34 | 98.34 | 0.49 | 187.65 | true |
volta_gcgemm_32x32_nt | 6 | 8207.00 | 1.54 | 98420654080 | 320192160.00 | 429414741.33 | 16.84 | 131.30 | 11992.28 | false |
volta_scudnn_128x128_relu_interior_nn_v1 | 2 | 12989.00 | 2.43 | 158199709696 | 642222805.33 | 307160608.00 | 16.50 | 166.63 | 12179.51 | false |
volta_scudnn_128x128_relu_small_nn_v1 | 1 | 10074.33 | 1.89 | 118444261376 | 55888181.33 | 47991925.33 | 15.32 | 1140.20 | 11757.03 | false |
volta_scudnn_128x64_relu_interior_nn_v1 | 100 | 217689.00 | 40.78 | 2725258461184 | 15127680778.66 | 12556348576.00 | 24.43 | 98.44 | 12519.05 | false |
volta_scudnn_128x64_relu_medium_nn_v1 | 0 | 5133.00 | 0.96 | 62889394176 | 11323178.67 | 287966005.33 | 12.70 | 210.13 | 12251.98 | false |
volta_scudnn_128x64_relu_small_nn_v1 | 0 | 4678.00 | 0.88 | 59215708160 | 69098794.67 | 20239242.67 | 14.60 | 662.83 | 12658.34 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2 | 8873.33 | 1.66 | 95567216640 | 229661141.33 | 432125066.67 | 24.80 | 144.41 | 10770.16 | false |
volta_sgemm_128x32_sliced1x4_tn | 0 | 139.00 | 0.03 | 1077936128 | 10693504.00 | 325760.00 | 12.50 | 97.82 | 7754.94 | false |
volta_sgemm_128x64_nn | 36 | 55811.67 | 10.46 | 716471009280 | 3086978218.67 | 2793039125.33 | 24.70 | 121.85 | 12837.30 | false |
Showing 1 to 24 of 24 entries