GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 104 | 356.33 | 0.07 | 0 | 31413.33 | 708437.34 | 5.79 | 0.00 | 0.00 | true |
void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 154 | 62468.66 | 11.84 | 35527065600 | 9600200586.67 | 12835024746.66 | 82.86 | 1.58 | 568.72 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 233.33 | 0.04 | 32017408 | 172494282.67 | 5624234.67 | 60.80 | 0.18 | 137.22 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 1669.00 | 0.32 | 51380224 | 177005525.33 | 137962282.67 | 72.30 | 0.16 | 30.79 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 2 | 16.67 | 0.00 | 712704 | 449600.00 | 969824.00 | 6.20 | 0.50 | 42.76 | true |
void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 38 | 10375.33 | 1.97 | 9437184000 | 1926095264.00 | 5647166570.66 | 46.88 | 1.25 | 909.58 | true |
void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 38 | 797.00 | 0.15 | 254803968 | 113283840.00 | 448739690.67 | 42.34 | 0.45 | 319.70 | true |
void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 38 | 9922.33 | 1.88 | 11002183680 | 5675807402.67 | 2040986741.33 | 48.16 | 1.43 | 1108.83 | true |
void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 7 | 2681.00 | 0.51 | 5637144576 | 1141735797.33 | 884368992.00 | 47.48 | 2.78 | 2102.63 | true |
void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 7 | 2865.00 | 0.54 | 5918162944 | 828610602.67 | 1141630794.67 | 47.86 | 3.00 | 2065.68 | true |
void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 7 | 866.00 | 0.16 | 2959081472 | 4841728.00 | 566027850.67 | 46.61 | 5.18 | 3416.95 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0 | 6.00 | 0.00 | 256000 | 5760.00 | 0.00 | 60.10 | 44.44 | 42.67 | false |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 49 | 48687.67 | 9.23 | 3365404672 | 2946838805.33 | 3606094080.00 | 96.15 | 0.51 | 69.12 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 99 | 47137.67 | 8.94 | 21033779200 | 5611634165.33 | 7444023125.33 | 97.28 | 1.61 | 446.22 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 100 | 20090.33 | 3.81 | 3776446464 | 6053071733.34 | 6469307018.66 | 97.04 | 0.30 | 187.97 | true |
volta_gcgemm_32x32_nt | 7 | 9380.00 | 1.78 | 112480747520 | 362066176.00 | 486716192.00 | 16.91 | 132.52 | 11991.55 | false |
volta_scudnn_128x128_relu_interior_nn_v1 | 5 | 16400.33 | 3.11 | 197749637120 | 1009529461.33 | 378836565.33 | 18.05 | 142.43 | 12057.66 | false |
volta_scudnn_128x64_relu_interior_nn_v1 | 97 | 205387.00 | 38.94 | 2567058751488 | 14688594016.00 | 12240012778.67 | 24.82 | 95.33 | 12498.64 | false |
volta_scudnn_128x64_relu_medium_nn_v1 | 0 | 5135.00 | 0.97 | 62889394176 | 11270048.00 | 290839786.67 | 12.70 | 208.17 | 12247.20 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2 | 8881.67 | 1.68 | 95567216640 | 229960842.67 | 432139114.67 | 24.80 | 144.34 | 10760.05 | false |
volta_sgemm_128x32_sliced1x4_tn | 0 | 139.00 | 0.03 | 1077936128 | 10704800.00 | 224661.33 | 12.50 | 98.63 | 7754.94 | false |
volta_sgemm_128x64_nn | 38 | 59016.34 | 11.19 | 755182338048 | 3229886549.33 | 2906577493.33 | 24.70 | 123.06 | 12796.16 | false |
Showing 1 to 22 of 22 entries