GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage (%) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 70 | 248.00 | 0.94 | 0 | 98592.00 | 195285.33 | 6.21 | 0.00 | 0.00 | true |
maxwell_scudnn_128x128_relu_interior_nn | 66 | 9280.67 | 35.12 | 9953869824 | 65200565.33 | 31098421.33 | 12.99 | 103.36 | 1072.54 | false |
maxwell_scudnn_128x64_relu_interior_nn | 2 | 240.67 | 0.91 | 237158400 | 7983616.00 | 2720512.00 | 7.88 | 22.16 | 985.43 | true |
maxwell_scudnn_128x64_relu_medium_nn | 0 | 130.00 | 0.49 | 239239168 | 0.00 | 320.00 | 19.00 | 747622.40 | 1840.30 | false |
maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 32 | 2759.67 | 10.44 | 4995989504 | 111178112.00 | 35070079.99 | 13.83 | 34.16 | 1810.36 | false |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 103 | 1053.33 | 3.99 | 98780160 | 23755274.67 | 61402677.33 | 64.33 | 1.16 | 93.78 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 17.00 | 0.06 | 144148 | 12352.00 | 1824.00 | 11.90 | 10.17 | 8.48 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 52.00 | 0.20 | 200704 | 4046453.33 | 1630208.00 | 71.00 | 0.04 | 3.86 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 32 | 1943.00 | 7.35 | 137551872 | 75970026.66 | 112481760.00 | 58.63 | 0.73 | 70.79 | true |
void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 2, 2, false, cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float> >(cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float>, float, float) | 0 | 69.67 | 0.26 | 4495000 | 9712416.00 | 943957.33 | 39.00 | 0.42 | 64.52 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 3.00 | 0.01 | 1000 | 6176.00 | 0.00 | 12.40 | 0.16 | 0.33 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 32 | 589.00 | 2.23 | 8931328 | 57383274.67 | 23980949.33 | 82.84 | 0.11 | 15.16 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 65 | 916.00 | 3.47 | 55820800 | 10330506.67 | 18049898.66 | 74.89 | 1.97 | 60.94 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 66 | 325.33 | 1.23 | 10536960 | 3265909.33 | 5563968.00 | 82.19 | 1.19 | 32.39 | true |
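The derived columns follow directly from the raw counters: arithmetic intensity is kernel_flops divided by total DRAM traffic (read plus write bytes), and arithmetic throughput is kernel_flops divided by kernel_duration. The kernel_memory_bound flag is presumably a roofline test, marking kernels whose arithmetic intensity falls below the device's ridge point (peak FLOPS divided by peak DRAM bandwidth). The sketch below recomputes these values for one row of the table; the peak FLOPS and bandwidth figures are placeholder assumptions, not values taken from this report, so the exact memory-bound threshold may differ from the one used above.

```python
# Minimal sketch (not the profiler's actual code): recompute the derived
# columns from the raw counters and apply a roofline memory-bound test.
# PEAK_GFLOPS and PEAK_DRAM_GBPS are assumed device peaks, not report values.

PEAK_GFLOPS = 6_000.0     # assumed peak FP32 throughput of the GPU (GFlop/s)
PEAK_DRAM_GBPS = 300.0    # assumed peak DRAM bandwidth (GB/s)
RIDGE_POINT = PEAK_GFLOPS / PEAK_DRAM_GBPS  # flops/byte where the roofline turns

def derived_metrics(flops, dram_read_bytes, dram_write_bytes, duration_us):
    """Return (arithmetic_intensity, throughput_gflops, memory_bound)."""
    bytes_moved = dram_read_bytes + dram_write_bytes
    intensity = flops / bytes_moved if bytes_moved else float("inf")
    throughput_gflops = flops / duration_us / 1e3  # flops/us -> GFlop/s
    memory_bound = intensity < RIDGE_POINT
    return intensity, throughput_gflops, memory_bound

# Example: the maxwell_scudnn_128x128_relu_interior_nn row from the table
ai, gflops, bound = derived_metrics(9_953_869_824, 65_200_565.33,
                                    31_098_421.33, 9_280.67)
print(f"intensity={ai:.2f} flops/byte, throughput={gflops:.2f} GFlops, "
      f"memory_bound={bound}")
# -> intensity=103.36 flops/byte, throughput=1072.54 GFlops, memory_bound=False
```

With these placeholder peaks the example row comes out compute-bound, matching the table; rows with low arithmetic intensity (for example the batch-norm and pooling kernels) would be flagged memory-bound under the same test.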