GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
compute_gemm_pointers(float2**, float2 const*, int, float2 const*, int, float2 const*, int, int) | 5 | 22.33 | 0.04 | 0 | 2176.00 | 78282.67 | 2.63 | 0.00 | 0.00 | true |
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 41 | 144.67 | 0.27 | 0 | 10218.67 | 91882.67 | 5.62 | 0.00 | 0.00 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 81.33 | 0.15 | 9240832 | 54464576.00 | 3549568.00 | 63.00 | 0.16 | 113.62 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 12 | 4928.00 | 9.24 | 181436416 | 633789152.00 | 521068042.67 | 66.94 | 0.16 | 36.82 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6 | 42.33 | 0.08 | 5374976 | 715946.67 | 4305408.00 | 13.37 | 1.07 | 126.97 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 29 | 4177.67 | 7.83 | 0 | 926342730.67 | 997180938.67 | 96.39 | 0.00 | 0.00 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 56 | 4344.33 | 8.14 | 412948480 | 1078396373.33 | 1150848000.00 | 48.68 | 0.19 | 95.05 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorSlicingOp<Eigen::array<int, 2ul> const, Eigen::array<int, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer> >, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const, Eigen::GpuDevice>, int) | 35 | 1652.09 | 3.10 | 0 | 576977002.66 | 578189600.00 | 95.52 | 0.00 | 0.00 | true |
void fft2d_c2r_16x16<float, false>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | 5 | 573.00 | 1.07 | 842096640 | 209564949.33 | 157340352.00 | 69.36 | 2.30 | 1469.63 | true |
void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 2 | 593.00 | 1.11 | 1145044992 | 231536832.00 | 180083029.33 | 47.15 | 2.78 | 1930.93 | true |
void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 11 | 569.29 | 1.07 | 1148760064 | 73668053.33 | 295973162.67 | 64.59 | 3.11 | 2017.90 | true |
void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 2 | 395.00 | 0.74 | 739770368 | 103695008.00 | 142780714.67 | 46.53 | 3.00 | 1872.84 | true |
void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 2 | 298.67 | 0.56 | 901595136 | 1446890.67 | 170368341.33 | 46.05 | 5.25 | 3018.74 | true |
void flip_filter<float, float>(float*, float const*, int, int, int, int) | 5 | 305.67 | 0.57 | 0 | 278858.67 | 6384960.00 | 4.12 | 0.00 | 0.00 | true |
void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 0 | 6.00 | 0.01 | 3075072 | 2560.00 | 170.67 | 37.80 | 1126.12 | 512.51 | false |
void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 0 | 6.00 | 0.01 | 128128 | 6080.00 | 74.67 | 44.30 | 20.82 | 21.35 | false |
void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 0 | 322.00 | 0.60 | 0 | 79864416.00 | 78048373.33 | 47.60 | 0.00 | 0.00 | true |
void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 0 | 9.67 | 0.02 | 1291776 | 6656.00 | 85.33 | 6.20 | 191.62 | 133.63 | false |
void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 0 | 5.00 | 0.01 | 0 | 3840.00 | 213.33 | 6.20 | 0.00 | 0.00 | true |
void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 57 | 404.00 | 0.76 | 0 | 26664810.67 | 25462656.00 | 44.31 | 0.00 | 0.00 | true |
void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 0 | 275.33 | 0.52 | 0 | 77077600.00 | 79710400.00 | 93.30 | 0.00 | 0.00 | true |
volta_cgemm_32x32_tn | 5 | 2197.33 | 4.12 | 26130382848 | 184809898.67 | 143112096.00 | 23.56 | 79.68 | 11891.86 | false |
volta_gcgemm_32x32_nt | 2 | 1510.67 | 2.83 | 17199104000 | 117622304.00 | 120662517.33 | 16.88 | 72.18 | 11385.11 | false |
volta_scudnn_128x32_relu_interior_nn_v1 | 5 | 1569.67 | 2.94 | 14811955200 | 677660000.00 | 74096192.00 | 20.27 | 19.70 | 9436.37 | false |
volta_scudnn_128x64_relu_interior_nn_v1 | 31 | 8993.33 | 16.86 | 89883836416 | 2078775658.66 | 409815509.33 | 19.83 | 36.12 | 9994.50 | false |
volta_scudnn_128x64_relu_medium_nn_v1 | 0 | 2582.67 | 4.84 | 31444697088 | 11255360.00 | 262971498.67 | 24.90 | 114.67 | 12175.28 | false |
volta_scudnn_128x64_relu_small_nn_v1 | 2 | 778.67 | 1.46 | 6943555584 | 1257888.00 | 8415125.33 | 16.13 | 717.83 | 8917.25 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 6 | 5586.00 | 10.47 | 60254322688 | 61305589.33 | 195721738.67 | 17.65 | 234.43 | 10786.67 | false |
Showing 1 to 28 of 28 entries