GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 73 | 256.00 | 0.07 | 0 | 16256.00 | 490794.66 | 5.79 | 0.00 | 0.00 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 101 | 42948.66 | 11.48 | 24246484992 | 6432225685.33 | 8775772096.00 | 82.59 | 1.59 | 564.55 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 238.33 | 0.06 | 37017088 | 176778400.00 | 5639744.00 | 60.50 | 0.20 | 155.32 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 1668.00 | 0.45 | 51380224 | 177097909.33 | 140263498.67 | 72.30 | 0.16 | 30.80 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 2 | 16.00 | 0.00 | 712704 | 449600.00 | 814037.33 | 6.20 | 0.56 | 44.54 | true |
void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 23 | 6361.00 | 1.70 | 5788139520 | 1181142656.00 | 3463662453.33 | 46.88 | 1.25 | 909.94 | true |
void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 23 | 499.00 | 0.13 | 159252480 | 70793557.33 | 280135434.67 | 42.99 | 0.45 | 319.14 | true |
void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 23 | 5979.33 | 1.60 | 6795821056 | 3481293248.00 | 1251392522.67 | 48.22 | 1.44 | 1136.55 | true |
void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 2 | 1005.67 | 0.27 | 2113929216 | 428159850.67 | 331822080.00 | 47.53 | 2.78 | 2102.02 | true |
void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 2 | 1079.67 | 0.29 | 2219311104 | 310766389.33 | 428015200.00 | 47.87 | 3.00 | 2055.55 | true |
void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 2 | 328.67 | 0.09 | 1109655552 | 1810314.67 | 212381034.67 | 46.63 | 5.18 | 3376.23 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 6.00 | 0.00 | 256000 | 5760.00 | 0.00 | 59.90 | 44.44 | 42.67 | false |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 0 | 3.33 | 0.00 | 0 | 1504.00 | 5717.33 | 12.10 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 32 | 33009.33 | 8.83 | 2286419968 | 2085898250.67 | 2717975296.00 | 96.29 | 0.48 | 69.27 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 100 | 40772.00 | 10.90 | 7642808320 | 6657250165.33 | 8660278709.33 | 98.46 | 0.50 | 187.45 | true |
volta_gcgemm_32x32_nt | 2 | 3522.33 | 0.94 | 42180280320 | 138921717.33 | 187178709.33 | 16.97 | 129.35 | 11975.10 | false |
volta_scudnn_128x128_relu_interior_nn_v1 | 2 | 13031.67 | 3.48 | 158199709696 | 638549717.33 | 308768778.67 | 16.46 | 167.00 | 12139.64 | false |
volta_scudnn_128x128_relu_small_nn_v1 | 1 | 10076.00 | 2.69 | 118444261376 | 55551594.67 | 48108864.00 | 15.26 | 1142.62 | 11755.09 | false |
volta_scudnn_128x64_relu_interior_nn_v1 | 66 | 146443.66 | 39.16 | 1828134060032 | 9904047786.67 | 8717622581.33 | 24.22 | 98.17 | 12483.53 | false |
volta_scudnn_128x64_relu_medium_nn_v1 | 0 | 5133.00 | 1.37 | 62889394176 | 11424256.00 | 288808714.67 | 12.70 | 209.47 | 12251.98 | false |
volta_scudnn_128x64_relu_small_nn_v1 | 0 | 4675.67 | 1.25 | 59215708160 | 67374090.67 | 20237280.00 | 14.70 | 675.89 | 12664.65 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2 | 8874.67 | 2.37 | 95567216640 | 228331552.00 | 431761536.00 | 24.80 | 144.78 | 10768.54 | false |
volta_sgemm_128x32_sliced1x4_tn | 0 | 138.67 | 0.04 | 1077936128 | 10723221.33 | 208629.33 | 12.50 | 98.61 | 7773.56 | false |
volta_sgemm_128x64_nn | 23 | 36295.00 | 9.70 | 464724688896 | 1971264778.66 | 1781654058.67 | 24.70 | 123.83 | 12804.10 | false |