GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 104 | 356.33 | 0.07 | 0 | 31413.33 | 708437.34 | 5.79 | 0.00 | 0.00 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 154 | 62468.66 | 11.84 | 35527065600 | 9600200586.67 | 12835024746.66 | 82.86 | 1.58 | 568.72 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 233.33 | 0.04 | 32017408 | 172494282.67 | 5624234.67 | 60.80 | 0.18 | 137.22 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 1669.00 | 0.32 | 51380224 | 177005525.33 | 137962282.67 | 72.30 | 0.16 | 30.79 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 2 | 16.67 | 0.00 | 712704 | 449600.00 | 969824.00 | 6.20 | 0.50 | 42.76 | true |
void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 38 | 10375.33 | 1.97 | 9437184000 | 1926095264.00 | 5647166570.66 | 46.88 | 1.25 | 909.58 | true |
void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 38 | 797.00 | 0.15 | 254803968 | 113283840.00 | 448739690.67 | 42.34 | 0.45 | 319.70 | true |
void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 38 | 9922.33 | 1.88 | 11002183680 | 5675807402.67 | 2040986741.33 | 48.16 | 1.43 | 1108.83 | true |
void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 7 | 2681.00 | 0.51 | 5637144576 | 1141735797.33 | 884368992.00 | 47.48 | 2.78 | 2102.63 | true |
void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 7 | 2865.00 | 0.54 | 5918162944 | 828610602.67 | 1141630794.67 | 47.86 | 3.00 | 2065.68 | true |
void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 7 | 866.00 | 0.16 | 2959081472 | 4841728.00 | 566027850.67 | 46.61 | 5.18 | 3416.95 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 6.00 | 0.00 | 256000 | 5760.00 | 0.00 | 60.10 | 44.44 | 42.67 | false |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 49 | 48687.67 | 9.23 | 3365404672 | 2946838805.33 | 3606094080.00 | 96.15 | 0.51 | 69.12 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 99 | 47137.67 | 8.94 | 21033779200 | 5611634165.33 | 7444023125.33 | 97.28 | 1.61 | 446.22 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 100 | 20090.33 | 3.81 | 3776446464 | 6053071733.34 | 6469307018.66 | 97.04 | 0.30 | 187.97 | true |
volta_gcgemm_32x32_nt | 7 | 9380.00 | 1.78 | 112480747520 | 362066176.00 | 486716192.00 | 16.91 | 132.52 | 11991.55 | false |
volta_scudnn_128x128_relu_interior_nn_v1 | 5 | 16400.33 | 3.11 | 197749637120 | 1009529461.33 | 378836565.33 | 18.05 | 142.43 | 12057.66 | false |
volta_scudnn_128x64_relu_interior_nn_v1 | 97 | 205387.00 | 38.94 | 2567058751488 | 14688594016.00 | 12240012778.67 | 24.82 | 95.33 | 12498.64 | false |
volta_scudnn_128x64_relu_medium_nn_v1 | 0 | 5135.00 | 0.97 | 62889394176 | 11270048.00 | 290839786.67 | 12.70 | 208.17 | 12247.20 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2 | 8881.67 | 1.68 | 95567216640 | 229960842.67 | 432139114.67 | 24.80 | 144.34 | 10760.05 | false |
volta_sgemm_128x32_sliced1x4_tn | 0 | 139.00 | 0.03 | 1077936128 | 10704800.00 | 224661.33 | 12.50 | 98.63 | 7754.94 | false |
volta_sgemm_128x64_nn | 38 | 59016.34 | 11.19 | 755182338048 | 3229886549.33 | 2906577493.33 | 24.70 | 123.06 | 12796.16 | false |
Showing 1 to 22 of 22 entries