GPU Kernel Information Aggregated by Name
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|
kernel_name | kernel_count | kernel_duration (us) | model_duration_percentage | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|
cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 7 | 44.00 | 0.06 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
maxwell_scudnn_128x128_relu_interior_nn | 4 | 1218.00 | 1.66 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
maxwell_scudnn_128x64_relu_small_nn | 0 | 867.33 | 1.18 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 23 | 32412.33 | 44.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
mxnet::op::nms_calculate_batch_start_kernel(int*, int*, unsigned long, int) | 0 | 4.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::detail::explicit_convolve_sgemm<float, int, 1024, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 4 | 1076.00 | 1.47 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 3 | 896.00 | 1.22 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::detail::implicit_convolve_sgemm<float, float, 512, 6, 8, 3, 3, 5, 1, true, false, false>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0 | 4140.33 | 5.64 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5 | 1489.33 | 2.03 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 21 | 2241.33 | 3.05 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 8 | 918.67 | 1.25 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 1, int>, mshadow::gpu, int, 1, 1>, int>, mshadow::expr::Plan<mshadow::expr::ScalarExp<int>, int> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 1, int>, mshadow::gpu, int, 1, 1>, int>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<int>, int>) | 0 | 57.33 | 0.08 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 569 | 8123.11 | 11.06 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::BinaryMapExp<mxnet::op::mshadow_op::bool_and, mshadow::expr::BinaryMapExp<mxnet::op::mshadow_op::greater_than, mshadow::Tensor<mshadow::gpu, 1, float>, mshadow::expr::ScalarExp<float>, float, 1>, mshadow::expr::BinaryMapExp<mxnet::op::mshadow_op::not_equal, mshadow::Tensor<mshadow::gpu, 1, float>, mshadow::expr::ScalarExp<float>, float, 1>, float, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::BinaryMapExp<mxnet::op::mshadow_op::bool_and, mshadow::expr::BinaryMapExp<mxnet::op::mshadow_op::greater_than, mshadow::Tensor<mshadow::gpu, 1, float>, mshadow::expr::ScalarExp<float>, float, 1>, mshadow::expr::BinaryMapExp<mxnet::op::mshadow_op::not_equal, mshadow::Tensor<mshadow::gpu, 1, float>, mshadow::expr::ScalarExp<float>, float, 1>, float, 1>, float>) | 0 | 113.00 | 0.15 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ReshapeExp<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 1>, float, 1, 3>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ReshapeExp<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 1>, float, 1, 3>, float>) | 1 | 837.71 | 1.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 0 | 58.00 | 0.08 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::UnaryMapExp<mxnet::op::mshadow_op::identity, mshadow::Tensor<mshadow::gpu, 1, float>, float, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::UnaryMapExp<mxnet::op::mshadow_op::identity, mshadow::Tensor<mshadow::gpu, 1, float>, float, 1>, float>) | 0 | 113.33 | 0.15 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, int>, int>, mshadow::expr::Plan<mshadow::expr::BinaryMapExp<mshadow::op::div, mshadow::Tensor<mshadow::gpu, 1, int>, mshadow::expr::ScalarExp<int>, int, 1>, int> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, int>, int>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::BinaryMapExp<mshadow::op::div, mshadow::Tensor<mshadow::gpu, 1, int>, mshadow::expr::ScalarExp<int>, int, 1>, int>) | 1 | 7.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, int>, int>, mshadow::expr::Plan<mshadow::expr::RangeExp<int>, int> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, int>, int>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::RangeExp<int>, int>) | 0 | 91.00 | 0.12 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::ReduceWithAxisExp<mshadow::red::sum, mshadow::expr::UnaryMapExp<mxnet::op::mshadow_op::square, mshadow::Tensor<mshadow::gpu, 3, float>, float, 1>, float, 3, false, 2>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ReduceWithAxisExp<mshadow::red::sum, mshadow::expr::UnaryMapExp<mxnet::op::mshadow_op::square, mshadow::Tensor<mshadow::gpu, 3, float>, float, 1>, float, 3, false, 2>, float>) | 0 | 222.00 | 0.30 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 2, float>, mshadow::gpu, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 2, float>, mshadow::gpu, float, 2, 1>, float>) | 280 | 4201.69 | 5.72 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::UnaryMapExp<mxnet::op::mshadow_op::square_root, mshadow::Tensor<mshadow::gpu, 2, float>, float, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::UnaryMapExp<mxnet::op::mshadow_op::square_root, mshadow::Tensor<mshadow::gpu, 2, float>, float, 1>, float>) | 0 | 4.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>, mshadow::expr::Plan<mshadow::expr::BinaryMapExp<mshadow::op::div, mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::expr::MakeTensorExp<mshadow::expr::BroadcastWithAxisExp<mshadow::Tensor<mshadow::gpu, 2, float>, float, 2, 3>, mshadow::Tensor<mshadow::gpu, 2, float>, 3, float>, float, 3>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::BinaryMapExp<mshadow::op::div, mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::expr::MakeTensorExp<mshadow::expr::BroadcastWithAxisExp<mshadow::Tensor<mshadow::gpu, 2, float>, float, 2, 3>, mshadow::Tensor<mshadow::gpu, 2, float>, 3, float>, float, 3>, float>) | 0 | 238.00 | 0.32 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 1 | 398.14 | 0.54 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>) | 9 | 52.77 | 0.07 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 4, float>, float>, mshadow::expr::Plan<mshadow::expr::TransposeExExp<mshadow::Tensor<mshadow::gpu, 4, float>, float, 4>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 4, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::TransposeExExp<mshadow::Tensor<mshadow::gpu, 4, float>, float, 4>, float>) | 18 | 347.67 | 0.47 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::compute_area, float*, float*, int*, int*, int, int, int, int>(int, float*, float*, int*, int*, int, int, int, int) | 0 | 6.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::div, 1>, float*, float*, float>(int, float*, float*, float) | 3 | 16.00 | 0.02 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::exp, 1>, float*, float*>(int, float*, float*) | 3 | 14.67 | 0.02 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::gt, 1>, float*, float*, float>(int, float*, float*, float) | 0 | 113.00 | 0.15 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::minus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1 | 9.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::mul, 1>, float*, float*, float>(int, float*, float*, float) | 20 | 595.53 | 0.81 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::mul, 1>, float*, float*, float*>(int, float*, float*, float*) | 6 | 22.57 | 0.03 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float>(int, float*, float*, float) | 132 | 402.33 | 0.55 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 5 | 23.33 | 0.03 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mxnet_op::set_to_int<0>, 1>, float*>(int, float*) | 3 | 115.33 | 0.16 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mxnet_op::set_to_int<1>, 1>, float*>(int, float*) | 1 | 116.00 | 0.16 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::nms_assign, float*, float*, float*, int*, int*, int, int, int>(int, float*, float*, float*, int*, int*, int, int, int) | 0 | 534.00 | 0.73 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::slice_forward<5, 1, mshadow::gpu>, float*, float*, mshadow::Shape<5>, mshadow::Shape<5>, mxnet::common::StaticArray<int, 5>, mxnet::common::StaticArray<int, 5> >(int, float*, float*, mshadow::Shape<5>, mshadow::Shape<5>, mxnet::common::StaticArray<int, 5>, mxnet::common::StaticArray<int, 5>) | 10 | 65.33 | 0.09 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::where<1>, float*, float*, float*, float*>(int, float*, float*, float*, float*) | 3 | 690.33 | 0.94 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::mxnet_generic_kernel_ex<mxnet::op::mxnet_op::binary_broadcast_kernel<2, float, mxnet::op::mshadow_op::mul>, mxnet::OpReqType, mshadow::Shape<2>, mshadow::Shape<2>, mshadow::Shape<2>, float*, float*, float*>(int, mxnet::OpReqType, mshadow::Shape<2>, mshadow::Shape<2>, mshadow::Shape<2>, float*, float*, float*) | 1 | 177.00 | 0.24 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::mxnet_op::softmax_compute_kernel<7, mxnet::op::mxnet_op::softmax_fwd, false, float, 2, float, float>(float*, float*, int, int, mshadow::Shape<2>, mshadow::Shape<2>, double) | 0 | 681.33 | 0.93 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void mxnet::op::nms_apply_kernel<float, 2, true>(int, int*, int const*, float const*, float const*, int, int, int, int, float, bool, int, int) | 0 | 298.33 | 0.41 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36 | 5150.00 | 7.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 22 | 4545.00 | 6.19 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::CopyIfAgent<float*, float*, float*, mxnet::op::valid_value<float>, int, int*>, float*, float*, float*, mxnet::op::valid_value<float>, int, int*, thrust::cuda_cub::cub::ScanTileState<int, true>, unsigned long>(float*, float*, float*, mxnet::op::valid_value<float>, int, int*, thrust::cuda_cub::cub::ScanTileState<int, true>, unsigned long) | 0 | 140.00 | 0.19 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::CopyIfAgent<int*, float*, int*, mxnet::op::valid_value<float>, int, int*>, int*, float*, int*, mxnet::op::valid_value<float>, int, int*, thrust::cuda_cub::cub::ScanTileState<int, true>, unsigned long>(int*, float*, int*, mxnet::op::valid_value<float>, int, int*, thrust::cuda_cub::cub::ScanTileState<int, true>, unsigned long) | 0 | 133.00 | 0.18 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__copy_if::InitAgent<thrust::cuda_cub::cub::ScanTileState<int, true>, int*, int>, thrust::cuda_cub::cub::ScanTileState<int, true>, unsigned long, int*>(thrust::cuda_cub::cub::ScanTileState<int, true>, unsigned long, int*) | 1 | 5.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::cuda_cub::__transform::unary_transform_f<float*, float*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<float>, thrust::cuda_cub::__transform::always_true_predicate>, long) | 0 | 4.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::core::_kernel_agent<thrust::cuda_cub::__parallel_for::ParallelForAgent<thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>, thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long>(thrust::cuda_cub::__transform::unary_transform_f<int*, int*, thrust::cuda_cub::__transform::no_stencil_tag, thrust::identity<int>, thrust::cuda_cub::__transform::always_true_predicate>, long) | 2 | 9.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, false, true, float, int, int>(float const*, float*, int const*, int*, int*, int, int, int, thrust::cuda_cub::cub::GridEvenShare<int>) | 1 | 87.00 | 0.12 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, true, true, float, int, int>(float const*, float*, int const*, int*, int*, int, int, int, thrust::cuda_cub::cub::GridEvenShare<int>) | 2 | 70.33 | 0.10 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<int, int, int>::Policy700, false, false, int, int, int>(int const*, int*, int const*, int*, int*, int, int, int, thrust::cuda_cub::cub::GridEvenShare<int>) | 1 | 19.00 | 0.03 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::DeviceRadixSortDownsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<int, int, int>::Policy700, true, false, int, int, int>(int const*, int*, int const*, int*, int*, int, int, int, thrust::cuda_cub::cub::GridEvenShare<int>) | 2 | 24.56 | 0.03 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, false, true, float, int>(float const*, int*, int, int, int, thrust::cuda_cub::cub::GridEvenShare<int>) | 1 | 35.00 | 0.05 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, true, true, float, int>(float const*, int*, int, int, int, thrust::cuda_cub::cub::GridEvenShare<int>) | 2 | 36.89 | 0.05 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<int, int, int>::Policy700, false, false, int, int>(int const*, int*, int, int, int, thrust::cuda_cub::cub::GridEvenShare<int>) | 1 | 31.00 | 0.04 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::DeviceRadixSortUpsweepKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<int, int, int>::Policy700, true, false, int, int>(int const*, int*, int, int, int, thrust::cuda_cub::cub::GridEvenShare<int>) | 2 | 33.89 | 0.05 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<float, int, int>::Policy700, int>(int*, int) | 4 | 55.46 | 0.08 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
void thrust::cuda_cub::cub::RadixSortScanBinsKernel<thrust::cuda_cub::cub::DeviceRadixSortPolicy<int, int, int>::Policy700, int>(int*, int) | 4 | 55.31 | 0.08 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true |
Showing 1 to 60 of 60 entries