GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 37 | 125.67 | 0.90 | 0 | 44106.66 | 252618.67 | 6.61 | 0.00 | 0.00 | true |
maxwell_scudnn_128x128_relu_interior_nn | 32 | 4500.00 | 32.17 | 5833359360 | 30732170.67 | 29301952.00 | 13.60 | 97.17 | 1296.30 | false |
maxwell_scudnn_128x128_relu_small_nn | 0 | 249.33 | 1.78 | 264470528 | 0.00 | 59104.00 | 12.50 | 4474.66 | 1060.71 | false |
maxwell_scudnn_128x64_relu_interior_nn | 2 | 240.67 | 1.72 | 237158400 | 7496768.00 | 3312309.33 | 7.89 | 21.94 | 985.43 | true |
maxwell_scudnn_128x64_relu_medium_nn | 0 | 131.00 | 0.94 | 239239168 | 0.00 | 320.00 | 19.00 | 747622.40 | 1826.25 | false |
maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 12 | 1087.00 | 7.77 | 2054242304 | 24325717.33 | 11958656.00 | 15.43 | 56.62 | 1889.83 | false |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 50 | 684.66 | 4.89 | 62333952 | 23597024.00 | 39992160.00 | 69.92 | 0.98 | 91.04 | true |
void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 1 | 736.00 | 5.26 | 566306304 | 5167274.67 | 1165856.00 | 6.18 | 89.42 | 769.44 | false |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 17.00 | 0.12 | 144028 | 12320.00 | 426.67 | 11.90 | 11.30 | 8.47 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 52.00 | 0.37 | 200704 | 4048842.67 | 1636469.33 | 70.90 | 0.04 | 3.86 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12 | 756.00 | 5.40 | 52977664 | 18534794.67 | 26920234.67 | 56.88 | 1.17 | 70.08 | true |
void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 2, 2, false, cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float> >(cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float>, float, float) | 0 | 70.00 | 0.50 | 4495000 | 9702997.33 | 944469.33 | 39.00 | 0.42 | 64.21 | true |
void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 1 | 206.00 | 1.47 | 0 | 223552.00 | 2573632.00 | 7.37 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 3.33 | 0.02 | 1000 | 6112.00 | 0.00 | 12.40 | 0.16 | 0.30 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 0 | 3.00 | 0.02 | 0 | 1440.00 | 0.00 | 10.60 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 15 | 387.33 | 2.77 | 5519360 | 39583317.33 | 17625717.33 | 83.89 | 0.10 | 14.25 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 49 | 468.33 | 3.35 | 19618816 | 18930346.67 | 25614144.00 | 90.30 | 0.44 | 41.89 | true |
Showing 1 to 17 of 17 entries