GPU Kernel Information Aggregated by Name

kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
compute_gemm_pointers(float2**, float2 const*, int, float2 const*, int, float2 const*, int, int) | 14 | 53.67 | 0.08 | 0 | 4416.00 | 193194.67 | 2.61 | 0.00 | 0.00 | true |
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 46 | 167.33 | 0.25 | 0 | 13525.33 | 476330.67 | 5.75 | 0.00 | 0.00 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7 | 3902.00 | 5.72 | 2589394688 | 193430272.00 | 188909493.33 | 64.38 | 6.77 | 663.61 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 4 | 2043.00 | 2.99 | 63021056 | 239026016.00 | 106397013.33 | 69.91 | 0.18 | 30.85 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4 | 38.67 | 0.06 | 8433664 | 246261.33 | 6672458.67 | 17.36 | 1.22 | 218.11 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 44 | 4843.33 | 7.09 | 0 | 1246108736.00 | 1319398976.00 | 95.72 | 0.00 | 0.00 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 67 | 4005.00 | 5.87 | 373911552 | 1273097674.67 | 1322349066.66 | 48.11 | 0.14 | 93.36 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 37 | 1675.19 | 2.45 | 0 | 589831082.66 | 591060042.67 | 95.54 | 0.00 | 0.00 | true |
void fft2d_c2r_16x16<float, false>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | 14 | 1047.33 | 1.53 | 1539084288 | 380075658.66 | 235849674.67 | 67.62 | 2.50 | 1469.53 | true |
void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 5 | 857.00 | 1.26 | 1585446912 | 319895754.67 | 249702197.33 | 46.52 | 2.78 | 1850.00 | true |
void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 29 | 1505.00 | 2.20 | 3137085440 | 194317408.00 | 810140192.00 | 65.72 | 3.12 | 2084.44 | true |
void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 5 | 696.00 | 1.02 | 1294598144 | 181428650.67 | 249843488.00 | 46.82 | 3.00 | 1860.05 | true |
void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 5 | 332.67 | 0.49 | 970948608 | 931818.67 | 183372565.33 | 44.83 | 5.27 | 2918.67 | true |
void flip_filter<float, float>(float*, float const*, int, int, int, int) | 14 | 732.67 | 1.07 | 0 | 8362.67 | 17555498.67 | 4.01 | 0.00 | 0.00 | true |
void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 0 | 5.67 | 0.01 | 3075072 | 2560.00 | 256.00 | 37.70 | 1092.00 | 542.63 | false |
void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 1 | 1313.33 | 1.92 | 102888576 | 116611178.67 | 232125269.33 | 48.28 | 0.30 | 78.34 | true |
void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 4 | 929.33 | 1.36 | 0 | 209626784.00 | 220147968.00 | 47.54 | 0.00 | 0.00 | true |
void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 0 | 9.00 | 0.01 | 1291776 | 6656.00 | 384.00 | 6.20 | 183.49 | 143.53 | false |
void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 0 | 5.00 | 0.01 | 0 | 3840.00 | 682.67 | 6.20 | 0.00 | 0.00 | true |
void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 70 | 568.33 | 0.83 | 0 | 44758037.33 | 49647253.33 | 44.54 | 0.00 | 0.00 | true |
void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 0 | 274.67 | 0.40 | 0 | 77077994.67 | 79643914.67 | 93.30 | 0.00 | 0.00 | true |
volta_cgemm_32x32_tn | 14 | 5280.67 | 7.74 | 62349115392 | 525989674.67 | 263971882.67 | 23.30 | 78.93 | 11807.05 | false |
volta_gcgemm_32x32_nt | 5 | 1718.00 | 2.52 | 18600099840 | 436284010.67 | 323943872.00 | 16.66 | 24.47 | 10826.60 | false |
volta_scudnn_128x32_relu_interior_nn_v1 | 9 | 2981.00 | 4.37 | 31346753536 | 1286276384.00 | 116603573.33 | 20.64 | 22.34 | 10515.51 | false |
volta_scudnn_128x64_relu_interior_nn_v1 | 32 | 12146.67 | 17.79 | 128702398464 | 2241297301.33 | 517195210.67 | 20.58 | 46.66 | 10595.70 | false |
volta_scudnn_128x64_relu_medium_nn_v1 | 2 | 3116.11 | 4.56 | 35144073216 | 11277237.33 | 46985429.33 | 24.70 | 603.20 | 11278.18 | false |
volta_scudnn_128x64_relu_small_nn_v1 | 0 | 454.33 | 0.67 | 4627431424 | 3303637.33 | 4775690.67 | 15.50 | 572.75 | 10185.11 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 4 | 6385.67 | 9.35 | 70886359040 | 101450272.00 | 187807146.67 | 18.56 | 245.06 | 11100.86 | false |
Showing 1 to 28 of 28 entries