GPU Kernel Information Aggregated by Name

kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 56 | 3040.33 | 30.87 | 4726873065 | 14298005.33 | 16504789.33 | 3.36 | 153.46 | 1554.72 | false |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9 | 110.00 | 1.12 | 44446259 | 19520.00 | 4530485.33 | 48.73 | 9.77 | 404.06 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 3 | 32.00 | 0.32 | 708640 | 315541.33 | 3746368.00 | 47.02 | 0.17 | 22.14 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11 | 84.67 | 0.86 | 24053760 | 9838464.00 | 21527082.67 | 31.53 | 0.77 | 284.10 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 62 | 233.67 | 2.37 | 0 | 18229.33 | 10040320.00 | 51.43 | 0.00 | 0.00 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 93 | 399.33 | 4.05 | 8967488 | 3189472.00 | 6778997.34 | 48.74 | 0.90 | 22.46 | true |
void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 12 | 74.00 | 0.75 | 12718720 | 181290.67 | 3948693.33 | 1.84 | 3.08 | 171.87 | true |
void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 12 | 77.00 | 0.78 | 12718720 | 141632.00 | 4146698.67 | 1.85 | 2.97 | 165.18 | true |
void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 12 | 172.00 | 1.75 | 12781824 | 1447328.00 | 5386645.33 | 1.79 | 1.87 | 74.31 | true |
void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 12 | 127.67 | 1.30 | 12781824 | 1294944.00 | 6291733.33 | 1.79 | 1.68 | 100.12 | true |
void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 25 | 321.67 | 3.27 | 262471680 | 35136.00 | 71980277.33 | 13.99 | 3.64 | 815.97 | true |
void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 0 | 3.67 | 0.04 | 24024 | 2304.00 | 640.00 | 6.20 | 8.16 | 6.55 | true |
void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 0 | 4.00 | 0.04 | 1001 | 6304.00 | 128.00 | 47.60 | 0.16 | 0.25 | true |
void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 0 | 8.00 | 0.08 | 10431 | 6720.00 | 0.00 | 2.40 | 1.55 | 1.30 | true |
void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 0 | 4.67 | 0.05 | 0 | 4288.00 | 85.33 | 4.00 | 0.00 | 0.00 | true |
void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 94 | 822.66 | 8.35 | 0 | 95759765.33 | 47059104.00 | 44.45 | 0.00 | 0.00 | true |
void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 0 | 8.00 | 0.08 | 0 | 6741.33 | 125173.33 | 85.40 | 0.00 | 0.00 | true |
volta_gcgemm_64x32_nt | 25 | 656.00 | 6.66 | 4904529920 | 1155424.00 | 20468266.67 | 7.87 | 226.81 | 7476.42 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 11 | 492.33 | 5.00 | 2606757888 | 26101002.67 | 27544192.00 | 15.55 | 48.59 | 5294.70 | false |
Showing 1 to 19 of 19 entries