GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 1 | 7.00 | 0.04 | 0 | 2912.00 | 18208.00 | 6.16 | 0.00 | 0.00 | true |
void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 93 | 6109.34 | 35.33 | 12821614176 | 16659562.67 | 34506314.67 | 3.85 | 250.59 | 2098.69 | false |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 14 | 198.67 | 1.15 | 91524913 | 50346.67 | 13958101.33 | 52.67 | 6.53 | 460.69 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 3 | 40.00 | 0.23 | 752768 | 4500394.67 | 4658058.67 | 53.40 | 0.08 | 18.82 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 15 | 66.00 | 0.38 | 5879808 | 10858.67 | 2733941.33 | 6.20 | 2.14 | 89.09 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 99 | 366.67 | 2.12 | 0 | 428192.00 | 11884746.67 | 50.92 | 0.00 | 0.00 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 148 | 641.00 | 3.71 | 15089184 | 10681376.00 | 13724853.34 | 49.32 | 0.62 | 23.54 | true |
void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 16 | 101.89 | 0.59 | 22658858.666 | 145461.33 | 2997920.00 | 2.39 | 7.21 | 222.39 | true |
void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 21 | 118.34 | 0.68 | 28526272 | 85482.67 | 2544789.33 | 2.35 | 10.85 | 241.06 | true |
void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16 | 248.00 | 1.43 | 21257472 | 3228661.33 | 10814069.33 | 2.12 | 1.51 | 85.72 | true |
void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 21 | 240.33 | 1.39 | 28662272 | 5501589.33 | 16685898.67 | 2.21 | 1.29 | 119.26 | true |
void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 36 | 656.67 | 3.80 | 604778496 | 1703658.67 | 196121450.67 | 18.84 | 3.06 | 920.98 | true |
void gemv2N_kernel<int, int, float, float, float, 128, 8, 4, 4, 1, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>) | 0 | 20.67 | 0.12 | 3202871 | 6157376.00 | 2072586.67 | 6.20 | 0.39 | 154.98 | true |
void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 0 | 4.00 | 0.02 | 24024 | 2368.00 | 2432.00 | 6.20 | 5.00 | 6.01 | true |
void tensorflow::BiasNHWCKernel<float>(int, float const*, float const*, float*, int) | 0 | 3.67 | 0.02 | 1001 | 5600.00 | 213.33 | 47.20 | 0.17 | 0.27 | true |
void tensorflow::functor::BlockReduceKernel<int*, int*, 256, tensorflow::functor::Prod<int> >(int*, int*, int, tensorflow::functor::Prod<int>, std::iterator_traits<int*>::value_type) | 0 | 3.67 | 0.02 | 0 | 3072.00 | 213.33 | 12.10 | 0.00 | 0.00 | true |
void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 0 | 8.33 | 0.05 | 10431 | 6656.00 | 42.67 | 2.40 | 1.56 | 1.25 | true |
void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 0 | 5.00 | 0.03 | 0 | 4096.00 | 85.33 | 3.90 | 0.00 | 0.00 | true |
void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 148 | 1376.33 | 7.96 | 0 | 164499242.67 | 71475285.34 | 43.74 | 0.00 | 0.00 | true |
void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 0 | 8.00 | 0.05 | 0 | 7050.67 | 74378.67 | 82.60 | 0.00 | 0.00 | true |
volta_gcgemm_32x32_nt | 2 | 35.56 | 0.21 | 124945920 | 37205.33 | 2005898.67 | 4.50 | 61.15 | 3514.06 | false |
volta_gcgemm_64x32_nt | 35 | 1208.67 | 6.99 | 11253836800 | 122315968.00 | 71728170.67 | 10.12 | 58.00 | 9310.94 | false |
volta_scudnn_128x64_relu_interior_nn_v1 | 0 | 183.00 | 1.06 | 849838080 | 3501642.67 | 2118293.33 | 6.20 | 151.22 | 4643.92 | false |
volta_scudnn_128x64_relu_small_nn_v1 | 0 | 58.67 | 0.34 | 308969472 | 24128.00 | 1498282.67 | 6.20 | 202.95 | 5266.50 | false |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 15 | 396.00 | 2.29 | 2641551360 | 4328170.67 | 20158645.34 | 15.76 | 107.88 | 6670.60 | false |
Showing 1 to 25 of 25 entries