GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
compute_gemm_pointers(float2**, float2 const*, int, float2 const*, int, float2 const*, int, int) | 1 | 4.00 | 0.06 | 0 | 1472.00 | 24917.33 | 2.65 | 0.00 | 0.00 | true |
void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 38 | 1236.67 | 17.74 | 1229907408 | 259317.33 | 5531776.00 | 3.81 | 212.38 | 994.53 | false |
void cudnn::detail::implicit_convolve_sgemm<float, float, 128, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0 | 88.67 | 1.27 | 17047200 | 18944.00 | 330.67 | 3.10 | 884.44 | 192.26 | false |
void cudnn::detail::lrnForward_evenC<5, float, float>(cudnn::detail::LrnForwardParams<float, float>) | 1 | 143.00 | 2.05 | 8555008 | 13706.67 | 3141685.33 | 12.20 | 2.71 | 59.83 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0 | 10.00 | 0.14 | 66479 | 12800.00 | 4693.33 | 10.70 | 3.80 | 6.65 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 12 | 74.33 | 1.07 | 1417472 | 58069.33 | 3038496.00 | 42.92 | 0.46 | 19.07 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 9 | 58.00 | 0.83 | 20074496 | 233237.33 | 14732170.67 | 18.78 | 1.34 | 346.11 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 29 | 93.00 | 1.33 | 0 | 14506.67 | 2909845.34 | 50.92 | 0.00 | 0.00 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 56 | 195.33 | 2.80 | 3226160 | 437397.33 | 3480597.33 | 46.34 | 0.82 | 16.52 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorPaddingOp<Eigen::array<Eigen::IndexPair<int>, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 3, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorPaddingOp<Eigen::array<Eigen::IndexPair<int>, 3ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 3, 1, int>, 16, Eigen::MakePointer> const> const> const, Eigen::GpuDevice>, int) | 18 | 107.67 | 1.54 | 0 | 53589.33 | 43392.00 | 45.78 | 0.00 | 0.00 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 4, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorPaddingOp<Eigen::array<Eigen::IndexPair<int>, 4ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 4, 1, int>, 16, Eigen::MakePointer> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 4, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorPaddingOp<Eigen::array<Eigen::IndexPair<int>, 4ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 4, 1, int>, 16, Eigen::MakePointer> const> const> const, Eigen::GpuDevice>, int) | 13 | 75.67 | 1.09 | 0 | 366890.67 | 7034357.33 | 59.15 | 0.00 | 0.00 | true |
void fft2d_c2r_16x16<float, false>(float*, float2*, int, int, int, int, int, int, int, int, int, int, float, float, int, float*, float*) | 1 | 16.00 | 0.23 | 1145088 | 17472.00 | 89578.67 | 12.50 | 10.70 | 71.57 | true |
void fft2d_c2r_32x32<float, false, false, 1u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 4 | 66.00 | 0.95 | 8186528 | 393728.00 | 381152.00 | 24.80 | 10.56 | 124.04 | true |
void fft2d_r2c_16x16<float>(float2*, float const*, int, int, int, int, int, int, int, int) | 3 | 37.00 | 0.53 | 45490560 | 52864.00 | 9160810.67 | 25.71 | 4.94 | 1229.47 | true |
void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 4 | 67.67 | 0.97 | 2889728 | 215584.00 | 1027744.00 | 22.52 | 2.32 | 42.71 | true |
void fft2d_r2c_32x32<float, false, 5u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 4 | 90.00 | 1.29 | 190894080 | 107264.00 | 35572117.33 | 33.62 | 5.35 | 2121.05 | true |
void flip_filter<float, float>(float*, float const*, int, int, int, int) | 1 | 21.00 | 0.30 | 0 | 1621.33 | 901120.00 | 4.30 | 0.00 | 0.00 | true |
void gemv2N_kernel<int, int, float, float, float, 128, 8, 4, 4, 1, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>) | 0 | 15.00 | 0.22 | 2135512 | 4103168.00 | 848597.33 | 6.20 | 0.43 | 142.37 | true |
void gemv2N_kernel<int, int, float2, float2, float2, 128, 8, 4, 4, 1, cublasGemvParams<cublasGemvTensorStridedBatched<float2 const>, cublasGemvTensorStridedBatched<float2>, float2> >(cublasGemvParams<cublasGemvTensorStridedBatched<float2 const>, cublasGemvTensorStridedBatched<float2>, float2>) | 4 | 86.33 | 1.24 | 51492864 | 28293674.67 | 10512885.33 | 52.99 | 1.33 | 596.44 | true |
void gemv2T_kernel<int, int, float2, float2, float2, 128, 16, 2, 2, false, cublasGemvParams<cublasGemvTensorBatched<float2 const>, cublasGemvTensorBatched<float2>, float2> >(cublasGemvParams<cublasGemvTensorBatched<float2 const>, cublasGemvTensorBatched<float2>, float2>) | 1 | 25.00 | 0.36 | 15187968 | 2690314.67 | 2487594.67 | 48.79 | 2.93 | 607.52 | true |
void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 0 | 3.00 | 0.04 | 24000 | 2453.33 | 2389.33 | 6.20 | 4.96 | 8.00 | true |
void tensorflow::BiasNHWCKernel<float>(int, float const*, float const*, float*, int) | 0 | 3.00 | 0.04 | 1000 | 5568.00 | 341.33 | 47.20 | 0.17 | 0.33 | true |
void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 0 | 8.00 | 0.11 | 10421 | 6656.00 | 256.00 | 2.50 | 1.51 | 1.30 | true |
void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 0 | 4.00 | 0.06 | 0 | 3584.00 | 0.00 | 3.80 | 0.00 | 0.00 | true |
void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 56 | 315.67 | 4.53 | 0 | 23870442.67 | 8318336.00 | 44.62 | 0.00 | 0.00 | true |
void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 0 | 6.00 | 0.09 | 0 | 6944.00 | 564469.33 | 61.80 | 0.00 | 0.00 | true |
void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 256, 32, 32, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 26 | 121.00 | 1.74 | 0 | 68384.00 | 15021930.67 | 30.16 | 0.00 | 0.00 | true |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 9 | 394.00 | 5.65 | 1194016768 | 943648.00 | 8453856.00 | 13.67 | 127.06 | 3030.50 | false |
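The last three columns are derived from the measured ones. The sketch below (Python, not the profiler's own code) shows one plausible derivation: arithmetic intensity is kernel_flops divided by total DRAM traffic, arithmetic throughput is kernel_flops over the aggregated kernel duration, and a kernel is flagged memory bound when its intensity falls below the device's flops-to-bandwidth ratio (the roofline ridge point). The `device_balance_flops_per_byte` value used here is an assumption for a V100-class GPU, not a number taken from this report.

```python
# Minimal sketch of how the derived columns relate to the measured ones.
# The ridge-point threshold is an assumed device balance (peak FLOPS / peak
# DRAM bandwidth), not a value reported by the profiler.

def derive_kernel_metrics(flops, duration_us, dram_read_bytes, dram_write_bytes,
                          device_balance_flops_per_byte=17.4):  # assumed V100-class balance
    """Recompute kernel_arithmetic_intensity (flops/byte),
    kernel_arithmetic_throughput (GFlops), and kernel_memory_bound."""
    total_bytes = dram_read_bytes + dram_write_bytes
    # flops per byte of DRAM traffic; 0 if the kernel does no floating-point work
    arithmetic_intensity = flops / total_bytes if total_bytes > 0 else 0.0
    # flops / (us * 1e3) == flops / (s * 1e9), i.e. GFlops
    arithmetic_throughput_gflops = flops / (duration_us * 1e3) if duration_us > 0 else 0.0
    # Roofline-style classification: below the ridge point the kernel cannot
    # reach peak compute and is treated as memory bound.
    memory_bound = arithmetic_intensity < device_balance_flops_per_byte
    return arithmetic_intensity, arithmetic_throughput_gflops, memory_bound

# Example: the implicit_convolve_sgemm<float, float, 1024, ...> row above
ai, gflops, mem_bound = derive_kernel_metrics(
    flops=1_229_907_408, duration_us=1236.67,
    dram_read_bytes=259_317.33, dram_write_bytes=5_531_776.00)
print(f"{ai:.2f} flops/byte, {gflops:.2f} GFlops, memory_bound={mem_bound}")
# -> 212.38 flops/byte, 994.53 GFlops, memory_bound=False
```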