GPU Kernel Information Aggregated by Name

kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 37 | 129.11 | 1.74 | 0 | 3562.67 | 1301610.66 | 6.32 | 0.00 | 0.00 | true |
void conv2d_c1_k1_nchw_hw_packed_kernel<float, float, 3>(cudnnTensorStruct, float const*, cudnnFilterStruct, float const*, cudnnConvolutionStruct, cudnnTensorStruct, float*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, int) | 1 | 152.33 | 2.05 | 202309632 | 36479082.67 | 38658933.33 | 81.64 | 2.69 | 1328.07 | true |
void conv2d_grouped_direct_kernel<float, float, float, float, float, true, false, 0, 1, 3>(cudnnTensorStruct, float const*, cudnnFilterStruct, float const*, cudnnConvolutionStruct, cudnnTensorStruct, float*, float, float, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, cudnn::reduced_divisor, int, float const*, float const*, cudnnActivationStruct) | 8 | 280.33 | 3.77 | 203083776 | 70890698.67 | 51358826.67 | 85.23 | 1.66 | 724.44 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 26 | 1016.33 | 13.68 | 501325824 | 298807925.33 | 326414496.00 | 73.52 | 0.80 | 493.27 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 9.00 | 0.12 | 1051904 | 1898.67 | 322634.67 | 16.90 | 3.24 | 116.88 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 4.00 | 0.05 | 64000 | 4501.33 | 310154.67 | 33.80 | 0.20 | 16.00 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 26 | 868.33 | 11.69 | 161366016 | 280105141.33 | 313039477.33 | 92.91 | 0.27 | 185.83 | true |
volta_scudnn_128x128_relu_interior_nn_v1 | 0 | 37.67 | 0.51 | 208732160 | 278058.67 | 6857685.33 | 15.40 | 29.25 | 5541.51 | false |
volta_scudnn_128x32_relu_interior_nn_v1 | 2 | 260.00 | 3.50 | 1104674816 | 64465642.67 | 102264426.67 | 23.00 | 6.63 | 4248.75 | true |
volta_scudnn_128x32_relu_small_nn_v1 | 24 | 1741.05 | 23.43 | 9505341440 | 115711296.00 | 66878186.67 | 23.13 | 52.06 | 5459.55 | false |
volta_scudnn_128x64_relu_interior_nn_v1 | 8 | 455.66 | 6.13 | 3333095424 | 25059754.67 | 69077077.33 | 13.93 | 35.41 | 7314.79 | false |
volta_sgemm_64x32_sliced1x4_tn | 0 | 15.67 | 0.21 | 34603008 | 1027242.67 | 369877.33 | 12.20 | 24.77 | 2208.66 | false |

Showing 1 to 12 of 12 entries