GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 66 | 323.00 | 0.93 | 0 | 0.00 | 0.00 | 6.46 | 0.00 | 0.00 | true |
maxwell_scudnn_128x128_relu_interior_nn | 61 | 10488.67 | 30.18 | 9517760512 | 0.00 | 0.00 | 13.27 | 0.00 | 907.43 | true |
maxwell_scudnn_128x128_relu_small_nn | 0 | 317.33 | 0.91 | 264470528 | 0.00 | 0.00 | 12.50 | 0.00 | 833.42 | true |
maxwell_scudnn_128x64_relu_interior_nn | 2 | 268.67 | 0.77 | 237158400 | 0.00 | 0.00 | 9.72 | 0.00 | 882.73 | true |
maxwell_scudnn_128x64_relu_medium_nn | 0 | 179.00 | 0.52 | 239239168 | 0.00 | 0.00 | 21.10 | 0.00 | 1336.53 | true |
maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 29 | 3231.34 | 9.30 | 4435656704 | 0.00 | 0.00 | 14.37 | 0.00 | 1372.70 | true |
void cudnn::detail::activation_fw_4d_kernel<float, float, 128, 1, 4, cudnn::detail::relu_func<float, (cudnnNanPropagation_t)0, false> >(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, float, float, int, float) | 100 | 1206.33 | 3.47 | 14927360 | 0.00 | 0.00 | 51.40 | 0.00 | 12.37 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 101 | 1489.67 | 4.29 | 94712832 | 0.00 | 0.00 | 69.52 | 0.00 | 63.58 | true |
void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 3 | 1470.00 | 4.23 | 834791936 | 0.00 | 0.00 | 7.24 | 0.00 | 567.89 | true |
void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 2 | 535.33 | 1.54 | 314873856 | 0.00 | 0.00 | 18.27 | 0.00 | 588.18 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 24.67 | 0.07 | 144598 | 0.00 | 0.00 | 11.90 | 0.00 | 5.86 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 61.33 | 0.18 | 200704 | 0.00 | 0.00 | 69.00 | 0.00 | 3.27 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 29 | 2582.67 | 7.43 | 117596160 | 0.00 | 0.00 | 49.58 | 0.00 | 45.53 | true |
void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 2, 2, false, cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float> >(cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float>, float, float) | 0 | 87.67 | 0.25 | 4495000 | 0.00 | 0.00 | 48.50 | 0.00 | 51.27 | true |
void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 3 | 649.67 | 1.87 | 0 | 0.00 | 0.00 | 5.18 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 5.33 | 0.02 | 1000 | 0.00 | 0.00 | 12.40 | 0.00 | 0.19 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 0 | 5.00 | 0.01 | 0 | 0.00 | 0.00 | 11.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 32 | 711.00 | 2.05 | 8931328 | 0.00 | 0.00 | 81.84 | 0.00 | 12.56 | true |
Showing 1 to 18 of 18 entries