GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 107 | 326.67 | 0.40 | 0 | 7296.00 | 219189.31 | 6.94 | 0.00 | 0.00 | true |
maxwell_scudnn_128x128_relu_interior_nn | 7 | 463.33 | 0.57 | 229834752 | 3034677.33 | 590101.33 | 12.50 | 63.41 | 496.05 | false |
maxwell_scudnn_128x32_relu_interior_nn | 0 | 25.00 | 0.03 | 13647872 | 13813.33 | 1721482.67 | 29.40 | 7.86 | 545.91 | true |
maxwell_scudnn_128x32_relu_small_nn | 96 | 1113.50 | 1.38 | 156188672 | 193866.66 | 933813.34 | 7.33 | 138.50 | 140.27 | false |
maxwell_scudnn_128x64_relu_interior_nn | 1 | 61.00 | 0.08 | 40140800 | 52426.67 | 1375584.00 | 7.85 | 28.11 | 658.05 | true |
maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148t_nt | 1999 | 22592.00 | 27.99 | 2321022976 | 0.00 | 2001098.70 | 12.91 | 1159.87 | 102.74 | false |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 26 | 182.67 | 0.23 | 15666432 | 961696.00 | 6563690.67 | 42.77 | 2.08 | 85.76 | true |
void cudnn::detail::explicit_convolve_sgemm<float, int, 1024, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 1 | 97.00 | 0.12 | 50381824 | 3160213.33 | 356245.33 | 5.00 | 14.33 | 519.40 | true |
void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 383 | 2688.00 | 3.33 | 46174976 | 24576.00 | 7936.00 | 3.10 | 1420.24 | 17.18 | false |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 18.00 | 0.02 | 32287 | 12096.00 | 1696.00 | 10.90 | 2.34 | 1.79 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 1999 | 10000.00 | 12.39 | 29696000 | 128000.00 | 17194.42 | 6.20 | 204.53 | 2.97 | false |
void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 2, 2, false, cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float> >(cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float>, float, float) | 0 | 23.33 | 0.03 | 1135000 | 2444256.00 | 376672.00 | 38.70 | 0.40 | 48.64 | true |
void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 385 | 2803.72 | 3.47 | 0 | 106.67 | 2538.62 | 8.71 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 3.00 | 0.00 | 1000 | 5877.33 | 554.67 | 12.40 | 0.16 | 0.33 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 26 | 126.67 | 0.16 | 5042688 | 809472.00 | 2755434.67 | 81.84 | 1.41 | 39.81 | true |
Showing 1 to 15 of 15 entries