GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0 | 3.00 | 0.04 | 0 | 938.67 | 8778.67 | 7.30 | 0.00 | 0.00 | true |
void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 44 | 1894.33 | 27.81 | 1549930697 | 1302560.00 | 5053568.00 | 3.34 | 243.85 | 818.19 | false |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7 | 69.00 | 1.01 | 20229646 | 9557.33 | 1304618.67 | 40.69 | 15.39 | 293.18 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 4 | 31.00 | 0.46 | 492352 | 6634.67 | 1869589.33 | 37.97 | 0.26 | 15.88 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 26 | 124.67 | 1.83 | 37416960 | 25493.33 | 20886506.67 | 13.83 | 1.79 | 300.14 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 44 | 142.00 | 2.08 | 0 | 1962.67 | 2036458.67 | 48.84 | 0.00 | 0.00 | true |
void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 67 | 275.00 | 4.04 | 2921184 | 95637.33 | 208437.34 | 45.19 | 9.61 | 10.62 | true |
void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 0 | 3.00 | 0.04 | 24024 | 2048.00 | 42.67 | 6.20 | 11.49 | 8.01 | true |
void tensorflow::BiasNCHWKernel<float>(int, float const*, float const*, float*, int, int) | 1 | 11.33 | 0.17 | 803817 | 4640.00 | 1475178.67 | 47.14 | 0.54 | 70.93 | true |
void tensorflow::functor::PadInputCustomKernelNCHW<float, 4>(int, float const*, tensorflow::functor::Dimension<4>, float*, tensorflow::functor::Dimension<4>, tensorflow::functor::Dimension<(4)-(2)>) | 4 | 22.00 | 0.32 | 0 | 105386.67 | 385386.67 | 47.40 | 0.00 | 0.00 | true |
void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 0 | 8.00 | 0.12 | 10431 | 1962.67 | 128.00 | 2.30 | 4.99 | 1.30 | true |
void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 0 | 4.00 | 0.06 | 0 | 1450.67 | 256.00 | 4.00 | 0.00 | 0.00 | true |
void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 70 | 536.00 | 7.87 | 0 | 44735104.00 | 20160970.67 | 44.62 | 0.00 | 0.00 | true |
void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 0 | 6.00 | 0.09 | 0 | 3157.33 | 468170.67 | 59.00 | 0.00 | 0.00 | true |
volta_scudnn_128x64_relu_interior_nn_v1 | 0 | 17.33 | 0.25 | 40140800 | 11008.00 | 2279082.67 | 7.80 | 17.53 | 2315.86 | true |
volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 26 | 1037.33 | 15.23 | 1713725440 | 323680.00 | 9630528.00 | 12.98 | 172.16 | 1652.05 | false |
Showing 1 to 16 of 16 entries