GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 13 | 48.66 | 0.35 | 0 | 1941.33 | 422656.00 | 5.97 | 0.00 | 0.00 | true |
void conv2d_c1_k1_nchw_hw_packed_kernel<float, float, 3>(cudnnTensorStruct, float const*, cudnnFilterStruct, float const*, cudnnConvolutionStruct, cudnnTensorStruct, float*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, int) | 2 | 709.33 | 5.11 | 1011548160 | 193431264.00 | 193015253.33 | 69.99 | 2.62 | 1426.06 | true |
void conv2d_grouped_direct_kernel<float, float, float, float, float, true, false, 0, 1, 3>(cudnnTensorStruct, float const*, cudnnFilterStruct, float const*, cudnnConvolutionStruct, cudnnTensorStruct, float*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, int, float const*, float const*, cudnnActivationStruct) | 9 | 1017.00 | 7.33 | 790235136 | 395113898.67 | 185685248.00 | 85.73 | 1.36 | 777.03 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 26 | 2802.34 | 20.19 | 1503977472 | 965409066.67 | 981786688.00 | 80.14 | 0.77 | 536.69 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 30.00 | 0.22 | 3136512 | 14867285.33 | 3142613.33 | 50.40 | 0.17 | 104.55 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 4.00 | 0.03 | 64000 | 4736.00 | 0.00 | 34.40 | 13.51 | 16.00 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 26 | 2585.67 | 18.62 | 484098048 | 960852224.00 | 964906890.66 | 96.08 | 0.25 | 187.22 | true |
volta_scudnn_128x32_relu_interior_nn_v1 | 0 | 279.00 | 2.01 | 1888223232 | 115703850.67 | 77040053.33 | 23.90 | 9.80 | 6767.83 | true |
volta_scudnn_128x32_relu_small_nn_v1 | 0 | 201.00 | 1.45 | 1490026496 | 38772906.67 | 79432682.67 | 24.00 | 12.61 | 7413.07 | true |
volta_scudnn_128x64_relu_interior_nn_v1 | 11 | 3539.33 | 25.49 | 39217004544 | 987901994.67 | 456906698.67 | 22.80 | 27.14 | 11080.34 | false |
volta_sgemm_64x32_sliced1x4_tn | 0 | 23.67 | 0.17 | 101711872 | 3072949.33 | 4010.67 | 12.30 | 33.06 | 4297.62 | false |
Showing 1 to 11 of 11 entries