GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | densenet0_conv0_fwd | Convolution | [1,3,224,224] | 18144.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
0 | densenet0_conv0_fwd | Convolution | [1,3,224,224] | 18144.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_medium_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
1 | densenet0_batchnorm0_fwd | BatchNorm | [1,64,112,112] | 205.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
2 | densenet0_relu0_fwd | Activation | [1,64,112,112] | 142.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
3 | densenet0_pool0_fwd | Pooling | [1,64,112,112] | 2843.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
4 | densenet0_stage1_batchnorm0_fwd | BatchNorm | [1,64,56,56] | 116 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
5 | densenet0_stage1_relu0_fwd | Activation | [1,64,56,56] | 38 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
6 | densenet0_stage1_conv0_fwd | Convolution | [1,64,56,56] | 3866.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
6 | densenet0_stage1_conv0_fwd | Convolution | [1,64,56,56] | 3866.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
7 | densenet0_stage1_batchnorm1_fwd | BatchNorm | [1,128,56,56] | 126 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
8 | densenet0_stage1_relu1_fwd | Activation | [1,128,56,56] | 70.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
9 | densenet0_stage1_conv1_fwd | Convolution | [1,128,56,56] | 17473.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
9 | densenet0_stage1_conv1_fwd | Convolution | [1,128,56,56] | 17473.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
10 | densenet0_stage1_concat0 | Concat | [1,64,56,56] | 129 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
10 | densenet0_stage1_concat0 | Concat | [1,64,56,56] | 129 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
11 | densenet0_stage1_batchnorm2_fwd | BatchNorm | [1,96,56,56] | 111 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
12 | densenet0_stage1_relu2_fwd | Activation | [1,96,56,56] | 55.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
13 | densenet0_stage1_conv2_fwd | Convolution | [1,96,56,56] | 4555.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
13 | densenet0_stage1_conv2_fwd | Convolution | [1,96,56,56] | 4555.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
14 | densenet0_stage1_batchnorm3_fwd | BatchNorm | [1,128,56,56] | 122.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
15 | densenet0_stage1_relu3_fwd | Activation | [1,128,56,56] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
16 | densenet0_stage1_conv3_fwd | Convolution | [1,128,56,56] | 17159.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
16 | densenet0_stage1_conv3_fwd | Convolution | [1,128,56,56] | 17159.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
17 | densenet0_stage1_concat1 | Concat | [1,96,56,56] | 119.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
17 | densenet0_stage1_concat1 | Concat | [1,96,56,56] | 119.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
18 | densenet0_stage1_batchnorm4_fwd | BatchNorm | [1,128,56,56] | 122.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
19 | densenet0_stage1_relu4_fwd | Activation | [1,128,56,56] | 68.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
20 | densenet0_stage1_conv4_fwd | Convolution | [1,128,56,56] | 6580.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
20 | densenet0_stage1_conv4_fwd | Convolution | [1,128,56,56] | 6580.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
21 | densenet0_stage1_batchnorm5_fwd | BatchNorm | [1,128,56,56] | 135 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
22 | densenet0_stage1_relu5_fwd | Activation | [1,128,56,56] | 73.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
23 | densenet0_stage1_conv5_fwd | Convolution | [1,128,56,56] | 17256.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
23 | densenet0_stage1_conv5_fwd | Convolution | [1,128,56,56] | 17256.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
24 | densenet0_stage1_concat2 | Concat | [1,128,56,56] | 125.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
24 | densenet0_stage1_concat2 | Concat | [1,128,56,56] | 125.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
25 | densenet0_stage1_batchnorm6_fwd | BatchNorm | [1,160,56,56] | 120.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
26 | densenet0_stage1_relu6_fwd | Activation | [1,160,56,56] | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
27 | densenet0_stage1_conv6_fwd | Convolution | [1,160,56,56] | 7394.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
27 | densenet0_stage1_conv6_fwd | Convolution | [1,160,56,56] | 7394.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
28 | densenet0_stage1_batchnorm7_fwd | BatchNorm | [1,128,56,56] | 121 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
29 | densenet0_stage1_relu7_fwd | Activation | [1,128,56,56] | 74 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
30 | densenet0_stage1_conv7_fwd | Convolution | [1,128,56,56] | 17223.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
30 | densenet0_stage1_conv7_fwd | Convolution | [1,128,56,56] | 17223.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
31 | densenet0_stage1_concat3 | Concat | [1,160,56,56] | 134 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
31 | densenet0_stage1_concat3 | Concat | [1,160,56,56] | 134 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
32 | densenet0_stage1_batchnorm8_fwd | BatchNorm | [1,192,56,56] | 129 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
33 | densenet0_stage1_relu8_fwd | Activation | [1,192,56,56] | 113.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
34 | densenet0_stage1_conv8_fwd | Convolution | [1,192,56,56] | 8194.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
34 | densenet0_stage1_conv8_fwd | Convolution | [1,192,56,56] | 8194.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
35 | densenet0_stage1_batchnorm9_fwd | BatchNorm | [1,128,56,56] | 130 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
36 | densenet0_stage1_relu9_fwd | Activation | [1,128,56,56] | 78 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
37 | densenet0_stage1_conv9_fwd | Convolution | [1,128,56,56] | 17109.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
37 | densenet0_stage1_conv9_fwd | Convolution | [1,128,56,56] | 17109.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
38 | densenet0_stage1_concat4 | Concat | [1,192,56,56] | 142.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
38 | densenet0_stage1_concat4 | Concat | [1,192,56,56] | 142.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
39 | densenet0_stage1_batchnorm10_fwd | BatchNorm | [1,224,56,56] | 129.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
40 | densenet0_stage1_relu10_fwd | Activation | [1,224,56,56] | 109 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
41 | densenet0_stage1_conv10_fwd | Convolution | [1,224,56,56] | 9933.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
41 | densenet0_stage1_conv10_fwd | Convolution | [1,224,56,56] | 9933.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
42 | densenet0_stage1_batchnorm11_fwd | BatchNorm | [1,128,56,56] | 122.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
43 | densenet0_stage1_relu11_fwd | Activation | [1,128,56,56] | 69 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
44 | densenet0_stage1_conv11_fwd | Convolution | [1,128,56,56] | 17102.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
44 | densenet0_stage1_conv11_fwd | Convolution | [1,128,56,56] | 17102.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
45 | densenet0_stage1_concat5 | Concat | [1,224,56,56] | 159.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
45 | densenet0_stage1_concat5 | Concat | [1,224,56,56] | 159.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
46 | densenet0_batchnorm1_fwd | BatchNorm | [1,256,56,56] | 138.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
47 | densenet0_relu1_fwd | Activation | [1,256,56,56] | 130.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
48 | densenet0_conv1_fwd | Convolution | [1,256,56,56] | 11063 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
48 | densenet0_conv1_fwd | Convolution | [1,256,56,56] | 11063 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
49 | densenet0_pool1_fwd | Pooling | [1,128,56,56] | 939.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
50 | densenet0_stage2_batchnorm0_fwd | BatchNorm | [1,128,28,28] | 89.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
51 | densenet0_stage2_relu0_fwd | Activation | [1,128,28,28] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
52 | densenet0_stage2_conv0_fwd | Convolution | [1,128,28,28] | 1878.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
52 | densenet0_stage2_conv0_fwd | Convolution | [1,128,28,28] | 1878.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
53 | densenet0_stage2_batchnorm1_fwd | BatchNorm | [1,128,28,28] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
54 | densenet0_stage2_relu1_fwd | Activation | [1,128,28,28] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
55 | densenet0_stage2_conv1_fwd | Convolution | [1,128,28,28] | 5019.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
55 | densenet0_stage2_conv1_fwd | Convolution | [1,128,28,28] | 5019.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
56 | densenet0_stage2_concat0 | Concat | [1,128,28,28] | 90.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
56 | densenet0_stage2_concat0 | Concat | [1,128,28,28] | 90.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
57 | densenet0_stage2_batchnorm2_fwd | BatchNorm | [1,160,28,28] | 26.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
58 | densenet0_stage2_relu2_fwd | Activation | [1,160,28,28] | 25.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
59 | densenet0_stage2_conv2_fwd | Convolution | [1,160,28,28] | 2176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
59 | densenet0_stage2_conv2_fwd | Convolution | [1,160,28,28] | 2176 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
60 | densenet0_stage2_batchnorm3_fwd | BatchNorm | [1,128,28,28] | 22.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
61 | densenet0_stage2_relu3_fwd | Activation | [1,128,28,28] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
62 | densenet0_stage2_conv3_fwd | Convolution | [1,128,28,28] | 4991.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
62 | densenet0_stage2_conv3_fwd | Convolution | [1,128,28,28] | 4991.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
63 | densenet0_stage2_concat1 | Concat | [1,160,28,28] | 80.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
63 | densenet0_stage2_concat1 | Concat | [1,160,28,28] | 80.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
64 | densenet0_stage2_batchnorm4_fwd | BatchNorm | [1,192,28,28] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
65 | densenet0_stage2_relu4_fwd | Activation | [1,192,28,28] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
66 | densenet0_stage2_conv4_fwd | Convolution | [1,192,28,28] | 2401.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
67 | densenet0_stage2_batchnorm5_fwd | BatchNorm | [1,128,28,28] | 69.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
68 | densenet0_stage2_relu5_fwd | Activation | [1,128,28,28] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
69 | densenet0_stage2_conv5_fwd | Convolution | [1,128,28,28] | 4802.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
69 | densenet0_stage2_conv5_fwd | Convolution | [1,128,28,28] | 4802.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
70 | densenet0_stage2_concat2 | Concat | [1,192,28,28] | 84 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
70 | densenet0_stage2_concat2 | Concat | [1,192,28,28] | 84 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
71 | densenet0_stage2_batchnorm6_fwd | BatchNorm | [1,224,28,28] | 25.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
72 | densenet0_stage2_relu6_fwd | Activation | [1,224,28,28] | 25.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
73 | densenet0_stage2_conv6_fwd | Convolution | [1,224,28,28] | 2969.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
74 | densenet0_stage2_batchnorm7_fwd | BatchNorm | [1,128,28,28] | 94 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
75 | densenet0_stage2_relu7_fwd | Activation | [1,128,28,28] | 19.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
76 | densenet0_stage2_conv7_fwd | Convolution | [1,128,28,28] | 4744.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
76 | densenet0_stage2_conv7_fwd | Convolution | [1,128,28,28] | 4744.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
77 | densenet0_stage2_concat3 | Concat | [1,224,28,28] | 96.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
77 | densenet0_stage2_concat3 | Concat | [1,224,28,28] | 96.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
78 | densenet0_stage2_batchnorm8_fwd | BatchNorm | [1,256,28,28] | 32 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
79 | densenet0_stage2_relu8_fwd | Activation | [1,256,28,28] | 39.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
80 | densenet0_stage2_conv8_fwd | Convolution | [1,256,28,28] | 3185 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
81 | densenet0_stage2_batchnorm9_fwd | BatchNorm | [1,128,28,28] | 98 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
82 | densenet0_stage2_relu9_fwd | Activation | [1,128,28,28] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
83 | densenet0_stage2_conv9_fwd | Convolution | [1,128,28,28] | 4671 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
83 | densenet0_stage2_conv9_fwd | Convolution | [1,128,28,28] | 4671 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
84 | densenet0_stage2_concat4 | Concat | [1,256,28,28] | 96.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
84 | densenet0_stage2_concat4 | Concat | [1,256,28,28] | 96.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
85 | densenet0_stage2_batchnorm10_fwd | BatchNorm | [1,288,28,28] | 34.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
86 | densenet0_stage2_relu10_fwd | Activation | [1,288,28,28] | 41.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
87 | densenet0_stage2_conv10_fwd | Convolution | [1,288,28,28] | 3416 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
88 | densenet0_stage2_batchnorm11_fwd | BatchNorm | [1,128,28,28] | 90.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
89 | densenet0_stage2_relu11_fwd | Activation | [1,128,28,28] | 26.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
90 | densenet0_stage2_conv11_fwd | Convolution | [1,128,28,28] | 4866.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
90 | densenet0_stage2_conv11_fwd | Convolution | [1,128,28,28] | 4866.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
91 | densenet0_stage2_concat5 | Concat | [1,288,28,28] | 94.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
91 | densenet0_stage2_concat5 | Concat | [1,288,28,28] | 94.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
92 | densenet0_stage2_batchnorm12_fwd | BatchNorm | [1,320,28,28] | 31 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
93 | densenet0_stage2_relu12_fwd | Activation | [1,320,28,28] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
94 | densenet0_stage2_conv12_fwd | Convolution | [1,320,28,28] | 3985 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
95 | densenet0_stage2_batchnorm13_fwd | BatchNorm | [1,128,28,28] | 91.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
96 | densenet0_stage2_relu13_fwd | Activation | [1,128,28,28] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
97 | densenet0_stage2_conv13_fwd | Convolution | [1,128,28,28] | 4864 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
97 | densenet0_stage2_conv13_fwd | Convolution | [1,128,28,28] | 4864 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
98 | densenet0_stage2_concat6 | Concat | [1,320,28,28] | 101.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
98 | densenet0_stage2_concat6 | Concat | [1,320,28,28] | 101.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
99 | densenet0_stage2_batchnorm14_fwd | BatchNorm | [1,352,28,28] | 40.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
100 | densenet0_stage2_relu14_fwd | Activation | [1,352,28,28] | 55 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
101 | densenet0_stage2_conv14_fwd | Convolution | [1,352,28,28] | 4233 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
102 | densenet0_stage2_batchnorm15_fwd | BatchNorm | [1,128,28,28] | 89.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
103 | densenet0_stage2_relu15_fwd | Activation | [1,128,28,28] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
104 | densenet0_stage2_conv15_fwd | Convolution | [1,128,28,28] | 4826 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
104 | densenet0_stage2_conv15_fwd | Convolution | [1,128,28,28] | 4826 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
105 | densenet0_stage2_concat7 | Concat | [1,352,28,28] | 96 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
105 | densenet0_stage2_concat7 | Concat | [1,352,28,28] | 96 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
106 | densenet0_stage2_batchnorm16_fwd | BatchNorm | [1,384,28,28] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
107 | densenet0_stage2_relu16_fwd | Activation | [1,384,28,28] | 51.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
108 | densenet0_stage2_conv16_fwd | Convolution | [1,384,28,28] | 4409 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
109 | densenet0_stage2_batchnorm17_fwd | BatchNorm | [1,128,28,28] | 83.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
110 | densenet0_stage2_relu17_fwd | Activation | [1,128,28,28] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
111 | densenet0_stage2_conv17_fwd | Convolution | [1,128,28,28] | 4835 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
111 | densenet0_stage2_conv17_fwd | Convolution | [1,128,28,28] | 4835 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
112 | densenet0_stage2_concat8 | Concat | [1,384,28,28] | 103.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
112 | densenet0_stage2_concat8 | Concat | [1,384,28,28] | 103.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
113 | densenet0_stage2_batchnorm18_fwd | BatchNorm | [1,416,28,28] | 45.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
114 | densenet0_stage2_relu18_fwd | Activation | [1,416,28,28] | 57 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
115 | densenet0_stage2_conv18_fwd | Convolution | [1,416,28,28] | 5029 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
116 | densenet0_stage2_batchnorm19_fwd | BatchNorm | [1,128,28,28] | 83.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
117 | densenet0_stage2_relu19_fwd | Activation | [1,128,28,28] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
118 | densenet0_stage2_conv19_fwd | Convolution | [1,128,28,28] | 4773.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
118 | densenet0_stage2_conv19_fwd | Convolution | [1,128,28,28] | 4773.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
119 | densenet0_stage2_concat9 | Concat | [1,416,28,28] | 98.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
119 | densenet0_stage2_concat9 | Concat | [1,416,28,28] | 98.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
120 | densenet0_stage2_batchnorm20_fwd | BatchNorm | [1,448,28,28] | 43.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
121 | densenet0_stage2_relu20_fwd | Activation | [1,448,28,28] | 67 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
122 | densenet0_stage2_conv20_fwd | Convolution | [1,448,28,28] | 5272 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
123 | densenet0_stage2_batchnorm21_fwd | BatchNorm | [1,128,28,28] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
124 | densenet0_stage2_relu21_fwd | Activation | [1,128,28,28] | 22.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
125 | densenet0_stage2_conv21_fwd | Convolution | [1,128,28,28] | 4845.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
125 | densenet0_stage2_conv21_fwd | Convolution | [1,128,28,28] | 4845.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
126 | densenet0_stage2_concat10 | Concat | [1,448,28,28] | 103.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
126 | densenet0_stage2_concat10 | Concat | [1,448,28,28] | 103.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
127 | densenet0_stage2_batchnorm22_fwd | BatchNorm | [1,480,28,28] | 56.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
128 | densenet0_stage2_relu22_fwd | Activation | [1,480,28,28] | 59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
129 | densenet0_stage2_conv22_fwd | Convolution | [1,480,28,28] | 5407 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
130 | densenet0_stage2_batchnorm23_fwd | BatchNorm | [1,128,28,28] | 85 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
131 | densenet0_stage2_relu23_fwd | Activation | [1,128,28,28] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
132 | densenet0_stage2_conv23_fwd | Convolution | [1,128,28,28] | 4857.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
132 | densenet0_stage2_conv23_fwd | Convolution | [1,128,28,28] | 4857.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
133 | densenet0_stage2_concat11 | Concat | [1,480,28,28] | 103.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
133 | densenet0_stage2_concat11 | Concat | [1,480,28,28] | 103.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
134 | densenet0_batchnorm2_fwd | BatchNorm | [1,512,28,28] | 46.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
135 | densenet0_relu2_fwd | Activation | [1,512,28,28] | 75.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
136 | densenet0_conv2_fwd | Convolution | [1,512,28,28] | 10596.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
137 | densenet0_pool2_fwd | Pooling | [1,256,28,28] | 505 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
138 | densenet0_stage3_batchnorm0_fwd | BatchNorm | [1,256,14,14] | 81.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
139 | densenet0_stage3_relu0_fwd | Activation | [1,256,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
140 | densenet0_stage3_conv0_fwd | Convolution | [1,256,14,14] | 876.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
141 | densenet0_stage3_batchnorm1_fwd | BatchNorm | [1,128,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
142 | densenet0_stage3_relu1_fwd | Activation | [1,128,14,14] | 7.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
143 | densenet0_stage3_conv1_fwd | Convolution | [1,128,14,14] | 1369.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
143 | densenet0_stage3_conv1_fwd | Convolution | [1,128,14,14] | 1369.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
144 | densenet0_stage3_concat0 | Concat | [1,256,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
144 | densenet0_stage3_concat0 | Concat | [1,256,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
145 | densenet0_stage3_batchnorm2_fwd | BatchNorm | [1,288,14,14] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
146 | densenet0_stage3_relu2_fwd | Activation | [1,288,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
147 | densenet0_stage3_conv2_fwd | Convolution | [1,288,14,14] | 938.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
148 | densenet0_stage3_batchnorm3_fwd | BatchNorm | [1,128,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
149 | densenet0_stage3_relu3_fwd | Activation | [1,128,14,14] | 7.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
150 | densenet0_stage3_conv3_fwd | Convolution | [1,128,14,14] | 1366.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
150 | densenet0_stage3_conv3_fwd | Convolution | [1,128,14,14] | 1366.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
151 | densenet0_stage3_concat1 | Concat | [1,288,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
151 | densenet0_stage3_concat1 | Concat | [1,288,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
152 | densenet0_stage3_batchnorm4_fwd | BatchNorm | [1,320,14,14] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
153 | densenet0_stage3_relu4_fwd | Activation | [1,320,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
154 | densenet0_stage3_conv4_fwd | Convolution | [1,320,14,14] | 1119.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
155 | densenet0_stage3_batchnorm5_fwd | BatchNorm | [1,128,14,14] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
156 | densenet0_stage3_relu5_fwd | Activation | [1,128,14,14] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
157 | densenet0_stage3_conv5_fwd | Convolution | [1,128,14,14] | 1369.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
157 | densenet0_stage3_conv5_fwd | Convolution | [1,128,14,14] | 1369.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
158 | densenet0_stage3_concat2 | Concat | [1,320,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
158 | densenet0_stage3_concat2 | Concat | [1,320,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
159 | densenet0_stage3_batchnorm6_fwd | BatchNorm | [1,352,14,14] | 22.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
160 | densenet0_stage3_relu6_fwd | Activation | [1,352,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
161 | densenet0_stage3_conv6_fwd | Convolution | [1,352,14,14] | 1193.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
162 | densenet0_stage3_batchnorm7_fwd | BatchNorm | [1,128,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
163 | densenet0_stage3_relu7_fwd | Activation | [1,128,14,14] | 9.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
164 | densenet0_stage3_conv7_fwd | Convolution | [1,128,14,14] | 1372.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
164 | densenet0_stage3_conv7_fwd | Convolution | [1,128,14,14] | 1372.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
165 | densenet0_stage3_concat3 | Concat | [1,352,14,14] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
165 | densenet0_stage3_concat3 | Concat | [1,352,14,14] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
166 | densenet0_stage3_batchnorm8_fwd | BatchNorm | [1,384,14,14] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
167 | densenet0_stage3_relu8_fwd | Activation | [1,384,14,14] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
168 | densenet0_stage3_conv8_fwd | Convolution | [1,384,14,14] | 1257.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
169 | densenet0_stage3_batchnorm9_fwd | BatchNorm | [1,128,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
170 | densenet0_stage3_relu9_fwd | Activation | [1,128,14,14] | 9.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
171 | densenet0_stage3_conv9_fwd | Convolution | [1,128,14,14] | 1370.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
171 | densenet0_stage3_conv9_fwd | Convolution | [1,128,14,14] | 1370.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
172 | densenet0_stage3_concat4 | Concat | [1,384,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
172 | densenet0_stage3_concat4 | Concat | [1,384,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
173 | densenet0_stage3_batchnorm10_fwd | BatchNorm | [1,416,14,14] | 22.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
174 | densenet0_stage3_relu10_fwd | Activation | [1,416,14,14] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
175 | densenet0_stage3_conv10_fwd | Convolution | [1,416,14,14] | 1432.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
176 | densenet0_stage3_batchnorm11_fwd | BatchNorm | [1,128,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
177 | densenet0_stage3_relu11_fwd | Activation | [1,128,14,14] | 9.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
178 | densenet0_stage3_conv11_fwd | Convolution | [1,128,14,14] | 1371.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
178 | densenet0_stage3_conv11_fwd | Convolution | [1,128,14,14] | 1371.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
179 | densenet0_stage3_concat5 | Concat | [1,416,14,14] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
179 | densenet0_stage3_concat5 | Concat | [1,416,14,14] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
180 | densenet0_stage3_batchnorm12_fwd | BatchNorm | [1,448,14,14] | 23.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
181 | densenet0_stage3_relu12_fwd | Activation | [1,448,14,14] | 17.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
182 | densenet0_stage3_conv12_fwd | Convolution | [1,448,14,14] | 1503.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
183 | densenet0_stage3_batchnorm13_fwd | BatchNorm | [1,128,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
184 | densenet0_stage3_relu13_fwd | Activation | [1,128,14,14] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
185 | densenet0_stage3_conv13_fwd | Convolution | [1,128,14,14] | 1368.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
185 | densenet0_stage3_conv13_fwd | Convolution | [1,128,14,14] | 1368.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
186 | densenet0_stage3_concat6 | Concat | [1,448,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
186 | densenet0_stage3_concat6 | Concat | [1,448,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
187 | densenet0_stage3_batchnorm14_fwd | BatchNorm | [1,480,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
188 | densenet0_stage3_relu14_fwd | Activation | [1,480,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
189 | densenet0_stage3_conv14_fwd | Convolution | [1,480,14,14] | 1553.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
190 | densenet0_stage3_batchnorm15_fwd | BatchNorm | [1,128,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
191 | densenet0_stage3_relu15_fwd | Activation | [1,128,14,14] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
192 | densenet0_stage3_conv15_fwd | Convolution | [1,128,14,14] | 1372.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
192 | densenet0_stage3_conv15_fwd | Convolution | [1,128,14,14] | 1372.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
193 | densenet0_stage3_concat7 | Concat | [1,480,14,14] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
193 | densenet0_stage3_concat7 | Concat | [1,480,14,14] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
194 | densenet0_stage3_batchnorm16_fwd | BatchNorm | [1,512,14,14] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
195 | densenet0_stage3_relu16_fwd | Activation | [1,512,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
196 | densenet0_stage3_conv16_fwd | Convolution | [1,512,14,14] | 1726.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
197 | densenet0_stage3_batchnorm17_fwd | BatchNorm | [1,128,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
198 | densenet0_stage3_relu17_fwd | Activation | [1,128,14,14] | 8.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
199 | densenet0_stage3_conv17_fwd | Convolution | [1,128,14,14] | 1370.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
199 | densenet0_stage3_conv17_fwd | Convolution | [1,128,14,14] | 1370.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
200 | densenet0_stage3_concat8 | Concat | [1,512,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
200 | densenet0_stage3_concat8 | Concat | [1,512,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
201 | densenet0_stage3_batchnorm18_fwd | BatchNorm | [1,544,14,14] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
202 | densenet0_stage3_relu18_fwd | Activation | [1,544,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
203 | densenet0_stage3_conv18_fwd | Convolution | [1,544,14,14] | 1803.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
204 | densenet0_stage3_batchnorm19_fwd | BatchNorm | [1,128,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
205 | densenet0_stage3_relu19_fwd | Activation | [1,128,14,14] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
206 | densenet0_stage3_conv19_fwd | Convolution | [1,128,14,14] | 1370.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
206 | densenet0_stage3_conv19_fwd | Convolution | [1,128,14,14] | 1370.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
207 | densenet0_stage3_concat9 | Concat | [1,544,14,14] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
207 | densenet0_stage3_concat9 | Concat | [1,544,14,14] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
208 | densenet0_stage3_batchnorm20_fwd | BatchNorm | [1,576,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
209 | densenet0_stage3_relu20_fwd | Activation | [1,576,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
210 | densenet0_stage3_conv20_fwd | Convolution | [1,576,14,14] | 1899 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
211 | densenet0_stage3_batchnorm21_fwd | BatchNorm | [1,128,14,14] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
212 | densenet0_stage3_relu21_fwd | Activation | [1,128,14,14] | 9.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
213 | densenet0_stage3_conv21_fwd | Convolution | [1,128,14,14] | 1366 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
213 | densenet0_stage3_conv21_fwd | Convolution | [1,128,14,14] | 1366 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
214 | densenet0_stage3_concat10 | Concat | [1,576,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
214 | densenet0_stage3_concat10 | Concat | [1,576,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
215 | densenet0_stage3_batchnorm22_fwd | BatchNorm | [1,608,14,14] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
216 | densenet0_stage3_relu22_fwd | Activation | [1,608,14,14] | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
217 | densenet0_stage3_conv22_fwd | Convolution | [1,608,14,14] | 2068.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
218 | densenet0_stage3_batchnorm23_fwd | BatchNorm | [1,128,14,14] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
219 | densenet0_stage3_relu23_fwd | Activation | [1,128,14,14] | 9.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
220 | densenet0_stage3_conv23_fwd | Convolution | [1,128,14,14] | 1364 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
220 | densenet0_stage3_conv23_fwd | Convolution | [1,128,14,14] | 1364 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
221 | densenet0_stage3_concat11 | Concat | [1,608,14,14] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
221 | densenet0_stage3_concat11 | Concat | [1,608,14,14] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
222 | densenet0_stage3_batchnorm24_fwd | BatchNorm | [1,640,14,14] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
223 | densenet0_stage3_relu24_fwd | Activation | [1,640,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
224 | densenet0_stage3_conv24_fwd | Convolution | [1,640,14,14] | 2140.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
225 | densenet0_stage3_batchnorm25_fwd | BatchNorm | [1,128,14,14] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
226 | densenet0_stage3_relu25_fwd | Activation | [1,128,14,14] | 10.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
227 | densenet0_stage3_conv25_fwd | Convolution | [1,128,14,14] | 1370.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
227 | densenet0_stage3_conv25_fwd | Convolution | [1,128,14,14] | 1370.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
228 | densenet0_stage3_concat12 | Concat | [1,640,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
228 | densenet0_stage3_concat12 | Concat | [1,640,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
229 | densenet0_stage3_batchnorm26_fwd | BatchNorm | [1,672,14,14] | 22.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
230 | densenet0_stage3_relu26_fwd | Activation | [1,672,14,14] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
231 | densenet0_stage3_conv26_fwd | Convolution | [1,672,14,14] | 2205.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
232 | densenet0_stage3_batchnorm27_fwd | BatchNorm | [1,128,14,14] | 25.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
233 | densenet0_stage3_relu27_fwd | Activation | [1,128,14,14] | 9.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
234 | densenet0_stage3_conv27_fwd | Convolution | [1,128,14,14] | 1365.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
234 | densenet0_stage3_conv27_fwd | Convolution | [1,128,14,14] | 1365.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
235 | densenet0_stage3_concat13 | Concat | [1,672,14,14] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
235 | densenet0_stage3_concat13 | Concat | [1,672,14,14] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
236 | densenet0_stage3_batchnorm28_fwd | BatchNorm | [1,704,14,14] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
237 | densenet0_stage3_relu28_fwd | Activation | [1,704,14,14] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
238 | densenet0_stage3_conv28_fwd | Convolution | [1,704,14,14] | 2363.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
239 | densenet0_stage3_batchnorm29_fwd | BatchNorm | [1,128,14,14] | 69.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
240 | densenet0_stage3_relu29_fwd | Activation | [1,128,14,14] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
241 | densenet0_stage3_conv29_fwd | Convolution | [1,128,14,14] | 1366.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
241 | densenet0_stage3_conv29_fwd | Convolution | [1,128,14,14] | 1366.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
242 | densenet0_stage3_concat14 | Concat | [1,704,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
242 | densenet0_stage3_concat14 | Concat | [1,704,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
243 | densenet0_stage3_batchnorm30_fwd | BatchNorm | [1,736,14,14] | 23.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
244 | densenet0_stage3_relu30_fwd | Activation | [1,736,14,14] | 17.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
245 | densenet0_stage3_conv30_fwd | Convolution | [1,736,14,14] | 2427 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
246 | densenet0_stage3_batchnorm31_fwd | BatchNorm | [1,128,14,14] | 76.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
247 | densenet0_stage3_relu31_fwd | Activation | [1,128,14,14] | 6.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
248 | densenet0_stage3_conv31_fwd | Convolution | [1,128,14,14] | 1364.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
248 | densenet0_stage3_conv31_fwd | Convolution | [1,128,14,14] | 1364.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
249 | densenet0_stage3_concat15 | Concat | [1,736,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
249 | densenet0_stage3_concat15 | Concat | [1,736,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
250 | densenet0_stage3_batchnorm32_fwd | BatchNorm | [1,768,14,14] | 24.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
251 | densenet0_stage3_relu32_fwd | Activation | [1,768,14,14] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
252 | densenet0_stage3_conv32_fwd | Convolution | [1,768,14,14] | 2484.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
253 | densenet0_stage3_batchnorm33_fwd | BatchNorm | [1,128,14,14] | 66.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
254 | densenet0_stage3_relu33_fwd | Activation | [1,128,14,14] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
255 | densenet0_stage3_conv33_fwd | Convolution | [1,128,14,14] | 1369 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
255 | densenet0_stage3_conv33_fwd | Convolution | [1,128,14,14] | 1369 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
256 | densenet0_stage3_concat16 | Concat | [1,768,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
256 | densenet0_stage3_concat16 | Concat | [1,768,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
257 | densenet0_stage3_batchnorm34_fwd | BatchNorm | [1,800,14,14] | 24.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
258 | densenet0_stage3_relu34_fwd | Activation | [1,800,14,14] | 18.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
259 | densenet0_stage3_conv34_fwd | Convolution | [1,800,14,14] | 2625 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
260 | densenet0_stage3_batchnorm35_fwd | BatchNorm | [1,128,14,14] | 68.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
261 | densenet0_stage3_relu35_fwd | Activation | [1,128,14,14] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
262 | densenet0_stage3_conv35_fwd | Convolution | [1,128,14,14] | 1373.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
262 | densenet0_stage3_conv35_fwd | Convolution | [1,128,14,14] | 1373.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
263 | densenet0_stage3_concat17 | Concat | [1,800,14,14] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
263 | densenet0_stage3_concat17 | Concat | [1,800,14,14] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
264 | densenet0_stage3_batchnorm36_fwd | BatchNorm | [1,832,14,14] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
265 | densenet0_stage3_relu36_fwd | Activation | [1,832,14,14] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
266 | densenet0_stage3_conv36_fwd | Convolution | [1,832,14,14] | 2687.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
267 | densenet0_stage3_batchnorm37_fwd | BatchNorm | [1,128,14,14] | 72 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
268 | densenet0_stage3_relu37_fwd | Activation | [1,128,14,14] | 9.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
269 | densenet0_stage3_conv37_fwd | Convolution | [1,128,14,14] | 1373.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
269 | densenet0_stage3_conv37_fwd | Convolution | [1,128,14,14] | 1373.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
270 | densenet0_stage3_concat18 | Concat | [1,832,14,14] | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
270 | densenet0_stage3_concat18 | Concat | [1,832,14,14] | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
271 | densenet0_stage3_batchnorm38_fwd | BatchNorm | [1,864,14,14] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
272 | densenet0_stage3_relu38_fwd | Activation | [1,864,14,14] | 19.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
273 | densenet0_stage3_conv38_fwd | Convolution | [1,864,14,14] | 2737.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
274 | densenet0_stage3_batchnorm39_fwd | BatchNorm | [1,128,14,14] | 71.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
275 | densenet0_stage3_relu39_fwd | Activation | [1,128,14,14] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
276 | densenet0_stage3_conv39_fwd | Convolution | [1,128,14,14] | 1374.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
276 | densenet0_stage3_conv39_fwd | Convolution | [1,128,14,14] | 1374.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
277 | densenet0_stage3_concat19 | Concat | [1,864,14,14] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
277 | densenet0_stage3_concat19 | Concat | [1,864,14,14] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
278 | densenet0_stage3_batchnorm40_fwd | BatchNorm | [1,896,14,14] | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
279 | densenet0_stage3_relu40_fwd | Activation | [1,896,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
280 | densenet0_stage3_conv40_fwd | Convolution | [1,896,14,14] | 2849 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
281 | densenet0_stage3_batchnorm41_fwd | BatchNorm | [1,128,14,14] | 83.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
282 | densenet0_stage3_relu41_fwd | Activation | [1,128,14,14] | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
283 | densenet0_stage3_conv41_fwd | Convolution | [1,128,14,14] | 1377 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
283 | densenet0_stage3_conv41_fwd | Convolution | [1,128,14,14] | 1377 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
284 | densenet0_stage3_concat20 | Concat | [1,896,14,14] | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
284 | densenet0_stage3_concat20 | Concat | [1,896,14,14] | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
285 | densenet0_stage3_batchnorm42_fwd | BatchNorm | [1,928,14,14] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
286 | densenet0_stage3_relu42_fwd | Activation | [1,928,14,14] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
287 | densenet0_stage3_conv42_fwd | Convolution | [1,928,14,14] | 2944.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
288 | densenet0_stage3_batchnorm43_fwd | BatchNorm | [1,128,14,14] | 73 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
289 | densenet0_stage3_relu43_fwd | Activation | [1,128,14,14] | 9.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
290 | densenet0_stage3_conv43_fwd | Convolution | [1,128,14,14] | 1375 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
290 | densenet0_stage3_conv43_fwd | Convolution | [1,128,14,14] | 1375 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
291 | densenet0_stage3_concat21 | Concat | [1,928,14,14] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
291 | densenet0_stage3_concat21 | Concat | [1,928,14,14] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
292 | densenet0_stage3_batchnorm44_fwd | BatchNorm | [1,960,14,14] | 27.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
293 | densenet0_stage3_relu44_fwd | Activation | [1,960,14,14] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
294 | densenet0_stage3_conv44_fwd | Convolution | [1,960,14,14] | 2980.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
295 | densenet0_stage3_batchnorm45_fwd | BatchNorm | [1,128,14,14] | 77.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
296 | densenet0_stage3_relu45_fwd | Activation | [1,128,14,14] | 6.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
297 | densenet0_stage3_conv45_fwd | Convolution | [1,128,14,14] | 1371.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
297 | densenet0_stage3_conv45_fwd | Convolution | [1,128,14,14] | 1371.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
298 | densenet0_stage3_concat22 | Concat | [1,960,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
298 | densenet0_stage3_concat22 | Concat | [1,960,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
299 | densenet0_stage3_batchnorm46_fwd | BatchNorm | [1,992,14,14] | 27.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
300 | densenet0_stage3_relu46_fwd | Activation | [1,992,14,14] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
301 | densenet0_stage3_conv46_fwd | Convolution | [1,992,14,14] | 3102.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
302 | densenet0_stage3_batchnorm47_fwd | BatchNorm | [1,128,14,14] | 88.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
303 | densenet0_stage3_relu47_fwd | Activation | [1,128,14,14] | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
304 | densenet0_stage3_conv47_fwd | Convolution | [1,128,14,14] | 1378 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
304 | densenet0_stage3_conv47_fwd | Convolution | [1,128,14,14] | 1378 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
305 | densenet0_stage3_concat23 | Concat | [1,992,14,14] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
305 | densenet0_stage3_concat23 | Concat | [1,992,14,14] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
306 | densenet0_batchnorm3_fwd | BatchNorm | [1,1024,14,14] | 29.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
307 | densenet0_relu3_fwd | Activation | [1,1024,14,14] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
308 | densenet0_conv3_fwd | Convolution | [1,1024,14,14] | 10374 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
309 | densenet0_pool3_fwd | Pooling | [1,512,14,14] | 251.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
310 | densenet0_stage4_batchnorm0_fwd | BatchNorm | [1,512,7,7] | 85.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
311 | densenet0_stage4_relu0_fwd | Activation | [1,512,7,7] | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
312 | densenet0_stage4_conv0_fwd | Convolution | [1,512,7,7] | 521.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
313 | densenet0_stage4_batchnorm1_fwd | BatchNorm | [1,128,7,7] | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
314 | densenet0_stage4_relu1_fwd | Activation | [1,128,7,7] | 4.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
315 | densenet0_stage4_conv1_fwd | Convolution | [1,128,7,7] | 390 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
315 | densenet0_stage4_conv1_fwd | Convolution | [1,128,7,7] | 390 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
316 | densenet0_stage4_concat0 | Concat | [1,512,7,7] | 10.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
316 | densenet0_stage4_concat0 | Concat | [1,512,7,7] | 10.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
317 | densenet0_stage4_batchnorm2_fwd | BatchNorm | [1,544,7,7] | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
318 | densenet0_stage4_relu2_fwd | Activation | [1,544,7,7] | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
319 | densenet0_stage4_conv2_fwd | Convolution | [1,544,7,7] | 541.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
320 | densenet0_stage4_batchnorm3_fwd | BatchNorm | [1,128,7,7] | 8.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
321 | densenet0_stage4_relu3_fwd | Activation | [1,128,7,7] | 4.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
322 | densenet0_stage4_conv3_fwd | Convolution | [1,128,7,7] | 382 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
322 | densenet0_stage4_conv3_fwd | Convolution | [1,128,7,7] | 382 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
323 | densenet0_stage4_concat1 | Concat | [1,544,7,7] | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
323 | densenet0_stage4_concat1 | Concat | [1,544,7,7] | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
324 | densenet0_stage4_batchnorm4_fwd | BatchNorm | [1,576,7,7] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
325 | densenet0_stage4_relu4_fwd | Activation | [1,576,7,7] | 6.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
326 | densenet0_stage4_conv4_fwd | Convolution | [1,576,7,7] | 571.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
327 | densenet0_stage4_batchnorm5_fwd | BatchNorm | [1,128,7,7] | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
328 | densenet0_stage4_relu5_fwd | Activation | [1,128,7,7] | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
329 | densenet0_stage4_conv5_fwd | Convolution | [1,128,7,7] | 382.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
329 | densenet0_stage4_conv5_fwd | Convolution | [1,128,7,7] | 382.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
330 | densenet0_stage4_concat2 | Concat | [1,576,7,7] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
330 | densenet0_stage4_concat2 | Concat | [1,576,7,7] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
331 | densenet0_stage4_batchnorm6_fwd | BatchNorm | [1,608,7,7] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
332 | densenet0_stage4_relu6_fwd | Activation | [1,608,7,7] | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
333 | densenet0_stage4_conv6_fwd | Convolution | [1,608,7,7] | 616 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
334 | densenet0_stage4_batchnorm7_fwd | BatchNorm | [1,128,7,7] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
335 | densenet0_stage4_relu7_fwd | Activation | [1,128,7,7] | 4.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
336 | densenet0_stage4_conv7_fwd | Convolution | [1,128,7,7] | 383 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
336 | densenet0_stage4_conv7_fwd | Convolution | [1,128,7,7] | 383 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
337 | densenet0_stage4_concat3 | Concat | [1,608,7,7] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
337 | densenet0_stage4_concat3 | Concat | [1,608,7,7] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
338 | densenet0_stage4_batchnorm8_fwd | BatchNorm | [1,640,7,7] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
339 | densenet0_stage4_relu8_fwd | Activation | [1,640,7,7] | 7.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
340 | densenet0_stage4_conv8_fwd | Convolution | [1,640,7,7] | 644.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
341 | densenet0_stage4_batchnorm9_fwd | BatchNorm | [1,128,7,7] | 8.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
342 | densenet0_stage4_relu9_fwd | Activation | [1,128,7,7] | 4.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
343 | densenet0_stage4_conv9_fwd | Convolution | [1,128,7,7] | 386.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
343 | densenet0_stage4_conv9_fwd | Convolution | [1,128,7,7] | 386.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
344 | densenet0_stage4_concat4 | Concat | [1,640,7,7] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
344 | densenet0_stage4_concat4 | Concat | [1,640,7,7] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
345 | densenet0_stage4_batchnorm10_fwd | BatchNorm | [1,672,7,7] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
346 | densenet0_stage4_relu10_fwd | Activation | [1,672,7,7] | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
347 | densenet0_stage4_conv10_fwd | Convolution | [1,672,7,7] | 663.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
348 | densenet0_stage4_batchnorm11_fwd | BatchNorm | [1,128,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
349 | densenet0_stage4_relu11_fwd | Activation | [1,128,7,7] | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
350 | densenet0_stage4_conv11_fwd | Convolution | [1,128,7,7] | 382.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
350 | densenet0_stage4_conv11_fwd | Convolution | [1,128,7,7] | 382.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
351 | densenet0_stage4_concat5 | Concat | [1,672,7,7] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
351 | densenet0_stage4_concat5 | Concat | [1,672,7,7] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
352 | densenet0_stage4_batchnorm12_fwd | BatchNorm | [1,704,7,7] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
353 | densenet0_stage4_relu12_fwd | Activation | [1,704,7,7] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
354 | densenet0_stage4_conv12_fwd | Convolution | [1,704,7,7] | 708 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
355 | densenet0_stage4_batchnorm13_fwd | BatchNorm | [1,128,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
356 | densenet0_stage4_relu13_fwd | Activation | [1,128,7,7] | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
357 | densenet0_stage4_conv13_fwd | Convolution | [1,128,7,7] | 384 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
357 | densenet0_stage4_conv13_fwd | Convolution | [1,128,7,7] | 384 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
358 | densenet0_stage4_concat6 | Concat | [1,704,7,7] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
358 | densenet0_stage4_concat6 | Concat | [1,704,7,7] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
359 | densenet0_stage4_batchnorm14_fwd | BatchNorm | [1,736,7,7] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
360 | densenet0_stage4_relu14_fwd | Activation | [1,736,7,7] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
361 | densenet0_stage4_conv14_fwd | Convolution | [1,736,7,7] | 733.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
362 | densenet0_stage4_batchnorm15_fwd | BatchNorm | [1,128,7,7] | 8.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
363 | densenet0_stage4_relu15_fwd | Activation | [1,128,7,7] | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
364 | densenet0_stage4_conv15_fwd | Convolution | [1,128,7,7] | 383.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
364 | densenet0_stage4_conv15_fwd | Convolution | [1,128,7,7] | 383.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
365 | densenet0_stage4_concat7 | Concat | [1,736,7,7] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
365 | densenet0_stage4_concat7 | Concat | [1,736,7,7] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
366 | densenet0_stage4_batchnorm16_fwd | BatchNorm | [1,768,7,7] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
367 | densenet0_stage4_relu16_fwd | Activation | [1,768,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
368 | densenet0_stage4_conv16_fwd | Convolution | [1,768,7,7] | 760.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
369 | densenet0_stage4_batchnorm17_fwd | BatchNorm | [1,128,7,7] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
370 | densenet0_stage4_relu17_fwd | Activation | [1,128,7,7] | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
371 | densenet0_stage4_conv17_fwd | Convolution | [1,128,7,7] | 383.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
371 | densenet0_stage4_conv17_fwd | Convolution | [1,128,7,7] | 383.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
372 | densenet0_stage4_concat8 | Concat | [1,768,7,7] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
372 | densenet0_stage4_concat8 | Concat | [1,768,7,7] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
373 | densenet0_stage4_batchnorm18_fwd | BatchNorm | [1,800,7,7] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
374 | densenet0_stage4_relu18_fwd | Activation | [1,800,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
375 | densenet0_stage4_conv18_fwd | Convolution | [1,800,7,7] | 800.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
376 | densenet0_stage4_batchnorm19_fwd | BatchNorm | [1,128,7,7] | 8.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
377 | densenet0_stage4_relu19_fwd | Activation | [1,128,7,7] | 4.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
378 | densenet0_stage4_conv19_fwd | Convolution | [1,128,7,7] | 383.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
378 | densenet0_stage4_conv19_fwd | Convolution | [1,128,7,7] | 383.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
379 | densenet0_stage4_concat9 | Concat | [1,800,7,7] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
379 | densenet0_stage4_concat9 | Concat | [1,800,7,7] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
380 | densenet0_stage4_batchnorm20_fwd | BatchNorm | [1,832,7,7] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
381 | densenet0_stage4_relu20_fwd | Activation | [1,832,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
382 | densenet0_stage4_conv20_fwd | Convolution | [1,832,7,7] | 825.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
383 | densenet0_stage4_batchnorm21_fwd | BatchNorm | [1,128,7,7] | 8.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
384 | densenet0_stage4_relu21_fwd | Activation | [1,128,7,7] | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
385 | densenet0_stage4_conv21_fwd | Convolution | [1,128,7,7] | 383.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
385 | densenet0_stage4_conv21_fwd | Convolution | [1,128,7,7] | 383.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
386 | densenet0_stage4_concat10 | Concat | [1,832,7,7] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
386 | densenet0_stage4_concat10 | Concat | [1,832,7,7] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
387 | densenet0_stage4_batchnorm22_fwd | BatchNorm | [1,864,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
388 | densenet0_stage4_relu22_fwd | Activation | [1,864,7,7] | 9.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
389 | densenet0_stage4_conv22_fwd | Convolution | [1,864,7,7] | 849.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
390 | densenet0_stage4_batchnorm23_fwd | BatchNorm | [1,128,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
391 | densenet0_stage4_relu23_fwd | Activation | [1,128,7,7] | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
392 | densenet0_stage4_conv23_fwd | Convolution | [1,128,7,7] | 381.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
392 | densenet0_stage4_conv23_fwd | Convolution | [1,128,7,7] | 381.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
393 | densenet0_stage4_concat11 | Concat | [1,864,7,7] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
393 | densenet0_stage4_concat11 | Concat | [1,864,7,7] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
394 | densenet0_stage4_batchnorm24_fwd | BatchNorm | [1,896,7,7] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
395 | densenet0_stage4_relu24_fwd | Activation | [1,896,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
396 | densenet0_stage4_conv24_fwd | Convolution | [1,896,7,7] | 890.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
397 | densenet0_stage4_batchnorm25_fwd | BatchNorm | [1,128,7,7] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
398 | densenet0_stage4_relu25_fwd | Activation | [1,128,7,7] | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
399 | densenet0_stage4_conv25_fwd | Convolution | [1,128,7,7] | 383.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
399 | densenet0_stage4_conv25_fwd | Convolution | [1,128,7,7] | 383.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
400 | densenet0_stage4_concat12 | Concat | [1,896,7,7] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
400 | densenet0_stage4_concat12 | Concat | [1,896,7,7] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
401 | densenet0_stage4_batchnorm26_fwd | BatchNorm | [1,928,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
402 | densenet0_stage4_relu26_fwd | Activation | [1,928,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
403 | densenet0_stage4_conv26_fwd | Convolution | [1,928,7,7] | 916.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
404 | densenet0_stage4_batchnorm27_fwd | BatchNorm | [1,128,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
405 | densenet0_stage4_relu27_fwd | Activation | [1,128,7,7] | 4.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
406 | densenet0_stage4_conv27_fwd | Convolution | [1,128,7,7] | 385 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
406 | densenet0_stage4_conv27_fwd | Convolution | [1,128,7,7] | 385 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
407 | densenet0_stage4_concat13 | Concat | [1,928,7,7] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
407 | densenet0_stage4_concat13 | Concat | [1,928,7,7] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
408 | densenet0_stage4_batchnorm28_fwd | BatchNorm | [1,960,7,7] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
409 | densenet0_stage4_relu28_fwd | Activation | [1,960,7,7] | 9.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
410 | densenet0_stage4_conv28_fwd | Convolution | [1,960,7,7] | 940.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
411 | densenet0_stage4_batchnorm29_fwd | BatchNorm | [1,128,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
412 | densenet0_stage4_relu29_fwd | Activation | [1,128,7,7] | 4.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
413 | densenet0_stage4_conv29_fwd | Convolution | [1,128,7,7] | 384.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
413 | densenet0_stage4_conv29_fwd | Convolution | [1,128,7,7] | 384.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
414 | densenet0_stage4_concat14 | Concat | [1,960,7,7] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
414 | densenet0_stage4_concat14 | Concat | [1,960,7,7] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
415 | densenet0_stage4_batchnorm30_fwd | BatchNorm | [1,992,7,7] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
416 | densenet0_stage4_relu30_fwd | Activation | [1,992,7,7] | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
417 | densenet0_stage4_conv30_fwd | Convolution | [1,992,7,7] | 980 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
418 | densenet0_stage4_batchnorm31_fwd | BatchNorm | [1,128,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
419 | densenet0_stage4_relu31_fwd | Activation | [1,128,7,7] | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
420 | densenet0_stage4_conv31_fwd | Convolution | [1,128,7,7] | 383 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
420 | densenet0_stage4_conv31_fwd | Convolution | [1,128,7,7] | 383 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
421 | densenet0_stage4_concat15 | Concat | [1,992,7,7] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
421 | densenet0_stage4_concat15 | Concat | [1,992,7,7] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
422 | densenet0_batchnorm4_fwd | BatchNorm | [1,1024,7,7] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
423 | densenet0_relu4_fwd | Activation | [1,1024,7,7] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
424 | densenet0_pool4_fwd | Pooling | [1,1024,7,7] | 77 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
426 | densenet0_dense0_fwd | FullyConnected | [1,1024] | 985.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 4, 4, false, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>, float, float) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true | |
426 | densenet0_dense0_fwd | FullyConnected | [1,1024] | 985.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 0.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | NaN | true |
Showing 1 to 553 of 553 entries