GPU Kernel Information
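The table below lists, per layer and per launched kernel, the raw counters captured by the profiler together with four derived columns: arithmetic intensity (flops/byte), achieved arithmetic throughput (GFlops), and the memory-bound flag. As a rough guide to how those derived columns relate to the raw counters, here is a minimal sketch following the standard roofline definitions. It is illustrative only: the function name and the peak-compute/peak-bandwidth figures are assumptions, not values recorded by this profile (in this run the flop and DRAM counters were not captured and read as zero, so the derived columns are zero as well).

```python
# Sketch of how the derived kernel columns are presumably computed from the raw
# counters (standard roofline definitions). The peak figures below are
# illustrative placeholders for a Maxwell-class GPU, not values from this run.

PEAK_GFLOPS = 6_000.0   # assumed peak compute (GFLOP/s), placeholder
PEAK_BW_GB_S = 300.0    # assumed peak DRAM bandwidth (GB/s), placeholder

def derived_metrics(flops, dram_read_bytes, dram_write_bytes, duration_us):
    """Return (arithmetic_intensity, achieved_throughput_gflops, memory_bound)."""
    total_bytes = dram_read_bytes + dram_write_bytes
    # flops per byte moved to/from DRAM; zero counters give zero intensity
    intensity = flops / total_bytes if total_bytes else 0.0
    # flops / (us * 1e3) converts FLOP per microsecond to GFLOP/s
    throughput = flops / (duration_us * 1e3) if duration_us else 0.0
    # roofline: kernels below the ridge point are limited by memory bandwidth
    memory_bound = intensity < PEAK_GFLOPS / PEAK_BW_GB_S
    return intensity, throughput, memory_bound

# Example with the raw counters of one row of this profile (counters are zero,
# so the derived columns come out as 0.00 / 0.00 / memory-bound):
print(derived_metrics(flops=0, dram_read_bytes=0, dram_write_bytes=0,
                      duration_us=179.67))
```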
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | densenet1_conv0_fwd | Convolution | [1,3,224,224] | 24619.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x32_relu_medium_nn | 179.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
0 | densenet1_conv0_fwd | Convolution | [1,3,224,224] | 24619.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
1 | densenet1_batchnorm0_fwd | BatchNorm | [1,96,112,112] | 257 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 83.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
2 | densenet1_relu0_fwd | Activation | [1,96,112,112] | 77.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 71.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
3 | densenet1_pool0_fwd | Pooling | [1,96,112,112] | 4089.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 73.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
4 | densenet1_stage1_batchnorm0_fwd | BatchNorm | [1,96,56,56] | 143.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 20.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
5 | densenet1_stage1_relu0_fwd | Activation | [1,96,56,56] | 27.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
6 | densenet1_stage1_conv0_fwd | Convolution | [1,96,56,56] | 6995 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 59.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
6 | densenet1_stage1_conv0_fwd | Convolution | [1,96,56,56] | 6995 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
7 | densenet1_stage1_batchnorm1_fwd | BatchNorm | [1,192,56,56] | 134.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 38.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
8 | densenet1_stage1_relu1_fwd | Activation | [1,192,56,56] | 42.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
9 | densenet1_stage1_conv1_fwd | Convolution | [1,192,56,56] | 40543.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 134.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
9 | densenet1_stage1_conv1_fwd | Convolution | [1,192,56,56] | 40543.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 14.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
10 | densenet1_stage1_concat0 | Concat | [1,96,56,56] | 153.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 18.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
10 | densenet1_stage1_concat0 | Concat | [1,96,56,56] | 153.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 17.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
11 | densenet1_stage1_batchnorm2_fwd | BatchNorm | [1,144,56,56] | 98.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 29.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
12 | densenet1_stage1_relu2_fwd | Activation | [1,144,56,56] | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 22.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
13 | densenet1_stage1_conv2_fwd | Convolution | [1,144,56,56] | 10038.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 79.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
13 | densenet1_stage1_conv2_fwd | Convolution | [1,144,56,56] | 10038.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
14 | densenet1_stage1_batchnorm3_fwd | BatchNorm | [1,192,56,56] | 127.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 37.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
15 | densenet1_stage1_relu3_fwd | Activation | [1,192,56,56] | 43 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
16 | densenet1_stage1_conv3_fwd | Convolution | [1,192,56,56] | 40250 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 133.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
16 | densenet1_stage1_conv3_fwd | Convolution | [1,192,56,56] | 40250 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
17 | densenet1_stage1_concat1 | Concat | [1,144,56,56] | 152.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 24.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
17 | densenet1_stage1_concat1 | Concat | [1,144,56,56] | 152.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 21.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
18 | densenet1_stage1_batchnorm4_fwd | BatchNorm | [1,192,56,56] | 118.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 39.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
19 | densenet1_stage1_relu4_fwd | Activation | [1,192,56,56] | 44 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
20 | densenet1_stage1_conv4_fwd | Convolution | [1,192,56,56] | 13384.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 96.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
20 | densenet1_stage1_conv4_fwd | Convolution | [1,192,56,56] | 13384.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
21 | densenet1_stage1_batchnorm5_fwd | BatchNorm | [1,192,56,56] | 135 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 37.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
22 | densenet1_stage1_relu5_fwd | Activation | [1,192,56,56] | 43.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
23 | densenet1_stage1_conv5_fwd | Convolution | [1,192,56,56] | 40075.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 133.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
23 | densenet1_stage1_conv5_fwd | Convolution | [1,192,56,56] | 40075.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
24 | densenet1_stage1_concat2 | Concat | [1,192,56,56] | 159.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 30.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
24 | densenet1_stage1_concat2 | Concat | [1,192,56,56] | 159.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 25.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
25 | densenet1_stage1_batchnorm6_fwd | BatchNorm | [1,240,56,56] | 143.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 48.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
26 | densenet1_stage1_relu6_fwd | Activation | [1,240,56,56] | 56.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 45.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
27 | densenet1_stage1_conv6_fwd | Convolution | [1,240,56,56] | 16758 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 116.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
27 | densenet1_stage1_conv6_fwd | Convolution | [1,240,56,56] | 16758 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
28 | densenet1_stage1_batchnorm7_fwd | BatchNorm | [1,192,56,56] | 137 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
29 | densenet1_stage1_relu7_fwd | Activation | [1,192,56,56] | 43.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
30 | densenet1_stage1_conv7_fwd | Convolution | [1,192,56,56] | 40198 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 134.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
30 | densenet1_stage1_conv7_fwd | Convolution | [1,192,56,56] | 40198 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
31 | densenet1_stage1_concat3 | Concat | [1,240,56,56] | 170.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 36.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
31 | densenet1_stage1_concat3 | Concat | [1,240,56,56] | 170.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 30.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
32 | densenet1_stage1_batchnorm8_fwd | BatchNorm | [1,288,56,56] | 145.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
33 | densenet1_stage1_relu8_fwd | Activation | [1,288,56,56] | 60.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 53.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
34 | densenet1_stage1_conv8_fwd | Convolution | [1,288,56,56] | 20088.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 134.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
34 | densenet1_stage1_conv8_fwd | Convolution | [1,288,56,56] | 20088.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
35 | densenet1_stage1_batchnorm9_fwd | BatchNorm | [1,192,56,56] | 140 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
36 | densenet1_stage1_relu9_fwd | Activation | [1,192,56,56] | 43 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
37 | densenet1_stage1_conv9_fwd | Convolution | [1,192,56,56] | 40262 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 133.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
37 | densenet1_stage1_conv9_fwd | Convolution | [1,192,56,56] | 40262 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
38 | densenet1_stage1_concat4 | Concat | [1,288,56,56] | 183 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 41.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
38 | densenet1_stage1_concat4 | Concat | [1,288,56,56] | 183 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 34.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
39 | densenet1_stage1_batchnorm10_fwd | BatchNorm | [1,336,56,56] | 158 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 66.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
40 | densenet1_stage1_relu10_fwd | Activation | [1,336,56,56] | 72.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 62.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
41 | densenet1_stage1_conv10_fwd | Convolution | [1,336,56,56] | 23458 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 153.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
41 | densenet1_stage1_conv10_fwd | Convolution | [1,336,56,56] | 23458 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
42 | densenet1_stage1_batchnorm11_fwd | BatchNorm | [1,192,56,56] | 137 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
43 | densenet1_stage1_relu11_fwd | Activation | [1,192,56,56] | 42.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
44 | densenet1_stage1_conv11_fwd | Convolution | [1,192,56,56] | 40222.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 135.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
44 | densenet1_stage1_conv11_fwd | Convolution | [1,192,56,56] | 40222.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
45 | densenet1_stage1_concat5 | Concat | [1,336,56,56] | 192 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 47.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
45 | densenet1_stage1_concat5 | Concat | [1,336,56,56] | 192 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 38.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
46 | densenet1_batchnorm1_fwd | BatchNorm | [1,384,56,56] | 171.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 73.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
47 | densenet1_relu1_fwd | Activation | [1,384,56,56] | 76.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 71.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
48 | densenet1_conv1_fwd | Convolution | [1,384,56,56] | 26844.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 172.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
48 | densenet1_conv1_fwd | Convolution | [1,384,56,56] | 26844.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
49 | densenet1_pool1_fwd | Pooling | [1,192,56,56] | 1732 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 31.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
50 | densenet1_stage2_batchnorm0_fwd | BatchNorm | [1,192,28,28] | 103 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
51 | densenet1_stage2_relu0_fwd | Activation | [1,192,28,28] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
52 | densenet1_stage2_conv0_fwd | Convolution | [1,192,28,28] | 3391.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 64.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
52 | densenet1_stage2_conv0_fwd | Convolution | [1,192,28,28] | 3391.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
53 | densenet1_stage2_batchnorm1_fwd | BatchNorm | [1,192,28,28] | 82.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
54 | densenet1_stage2_relu1_fwd | Activation | [1,192,28,28] | 19.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
55 | densenet1_stage2_conv1_fwd | Convolution | [1,192,28,28] | 10000.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
55 | densenet1_stage2_conv1_fwd | Convolution | [1,192,28,28] | 10000.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
56 | densenet1_stage2_concat0 | Concat | [1,192,28,28] | 106 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
56 | densenet1_stage2_concat0 | Concat | [1,192,28,28] | 106 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
57 | densenet1_stage2_batchnorm2_fwd | BatchNorm | [1,240,28,28] | 42.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
58 | densenet1_stage2_relu2_fwd | Activation | [1,240,28,28] | 22.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
59 | densenet1_stage2_conv2_fwd | Convolution | [1,240,28,28] | 4216.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 81.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
59 | densenet1_stage2_conv2_fwd | Convolution | [1,240,28,28] | 4216.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
60 | densenet1_stage2_batchnorm3_fwd | BatchNorm | [1,192,28,28] | 84.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
61 | densenet1_stage2_relu3_fwd | Activation | [1,192,28,28] | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
62 | densenet1_stage2_conv3_fwd | Convolution | [1,192,28,28] | 9915.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
62 | densenet1_stage2_conv3_fwd | Convolution | [1,192,28,28] | 9915.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
63 | densenet1_stage2_concat1 | Concat | [1,240,28,28] | 104 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
63 | densenet1_stage2_concat1 | Concat | [1,240,28,28] | 104 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
64 | densenet1_stage2_batchnorm4_fwd | BatchNorm | [1,288,28,28] | 42.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
65 | densenet1_stage2_relu4_fwd | Activation | [1,288,28,28] | 22.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
66 | densenet1_stage2_conv4_fwd | Convolution | [1,288,28,28] | 5065.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 94.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
66 | densenet1_stage2_conv4_fwd | Convolution | [1,288,28,28] | 5065.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
67 | densenet1_stage2_batchnorm5_fwd | BatchNorm | [1,192,28,28] | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
68 | densenet1_stage2_relu5_fwd | Activation | [1,192,28,28] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
69 | densenet1_stage2_conv5_fwd | Convolution | [1,192,28,28] | 9936.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
69 | densenet1_stage2_conv5_fwd | Convolution | [1,192,28,28] | 9936.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
70 | densenet1_stage2_concat2 | Concat | [1,288,28,28] | 106 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
70 | densenet1_stage2_concat2 | Concat | [1,288,28,28] | 106 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
71 | densenet1_stage2_batchnorm6_fwd | BatchNorm | [1,336,28,28] | 45.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 14.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
72 | densenet1_stage2_relu6_fwd | Activation | [1,336,28,28] | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
73 | densenet1_stage2_conv6_fwd | Convolution | [1,336,28,28] | 5902 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 107.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
73 | densenet1_stage2_conv6_fwd | Convolution | [1,336,28,28] | 5902 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
74 | densenet1_stage2_batchnorm7_fwd | BatchNorm | [1,192,28,28] | 93 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
75 | densenet1_stage2_relu7_fwd | Activation | [1,192,28,28] | 19.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
76 | densenet1_stage2_conv7_fwd | Convolution | [1,192,28,28] | 9912 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
76 | densenet1_stage2_conv7_fwd | Convolution | [1,192,28,28] | 9912 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
77 | densenet1_stage2_concat3 | Concat | [1,336,28,28] | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 14.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
77 | densenet1_stage2_concat3 | Concat | [1,336,28,28] | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
78 | densenet1_stage2_batchnorm8_fwd | BatchNorm | [1,384,28,28] | 52.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 19.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
79 | densenet1_stage2_relu8_fwd | Activation | [1,384,28,28] | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
80 | densenet1_stage2_conv8_fwd | Convolution | [1,384,28,28] | 6754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 120.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
80 | densenet1_stage2_conv8_fwd | Convolution | [1,384,28,28] | 6754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
81 | densenet1_stage2_batchnorm9_fwd | BatchNorm | [1,192,28,28] | 95 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
82 | densenet1_stage2_relu9_fwd | Activation | [1,192,28,28] | 18.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
83 | densenet1_stage2_conv9_fwd | Convolution | [1,192,28,28] | 9910.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
83 | densenet1_stage2_conv9_fwd | Convolution | [1,192,28,28] | 9910.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
84 | densenet1_stage2_concat4 | Concat | [1,384,28,28] | 113 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
84 | densenet1_stage2_concat4 | Concat | [1,384,28,28] | 113 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
85 | densenet1_stage2_batchnorm10_fwd | BatchNorm | [1,432,28,28] | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
86 | densenet1_stage2_relu10_fwd | Activation | [1,432,28,28] | 30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
87 | densenet1_stage2_conv10_fwd | Convolution | [1,432,28,28] | 7591 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 132.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
87 | densenet1_stage2_conv10_fwd | Convolution | [1,432,28,28] | 7591 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
88 | densenet1_stage2_batchnorm11_fwd | BatchNorm | [1,192,28,28] | 95.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
89 | densenet1_stage2_relu11_fwd | Activation | [1,192,28,28] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
90 | densenet1_stage2_conv11_fwd | Convolution | [1,192,28,28] | 9919.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
90 | densenet1_stage2_conv11_fwd | Convolution | [1,192,28,28] | 9919.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
91 | densenet1_stage2_concat5 | Concat | [1,432,28,28] | 112.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 17.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
91 | densenet1_stage2_concat5 | Concat | [1,432,28,28] | 112.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 14.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
92 | densenet1_stage2_batchnorm12_fwd | BatchNorm | [1,480,28,28] | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 26.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
93 | densenet1_stage2_relu12_fwd | Activation | [1,480,28,28] | 31.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 15.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
94 | densenet1_stage2_conv12_fwd | Convolution | [1,480,28,28] | 8438 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 145.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
94 | densenet1_stage2_conv12_fwd | Convolution | [1,480,28,28] | 8438 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
95 | densenet1_stage2_batchnorm13_fwd | BatchNorm | [1,192,28,28] | 97.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
96 | densenet1_stage2_relu13_fwd | Activation | [1,192,28,28] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
97 | densenet1_stage2_conv13_fwd | Convolution | [1,192,28,28] | 9907.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
97 | densenet1_stage2_conv13_fwd | Convolution | [1,192,28,28] | 9907.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
98 | densenet1_stage2_concat6 | Concat | [1,480,28,28] | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 19.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
98 | densenet1_stage2_concat6 | Concat | [1,480,28,28] | 108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 15.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
99 | densenet1_stage2_batchnorm14_fwd | BatchNorm | [1,528,28,28] | 64 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 29.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
100 | densenet1_stage2_relu14_fwd | Activation | [1,528,28,28] | 34.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 19.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
101 | densenet1_stage2_conv14_fwd | Convolution | [1,528,28,28] | 9296 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 158.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
101 | densenet1_stage2_conv14_fwd | Convolution | [1,528,28,28] | 9296 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
102 | densenet1_stage2_batchnorm15_fwd | BatchNorm | [1,192,28,28] | 98.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
103 | densenet1_stage2_relu15_fwd | Activation | [1,192,28,28] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
104 | densenet1_stage2_conv15_fwd | Convolution | [1,192,28,28] | 9894 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
104 | densenet1_stage2_conv15_fwd | Convolution | [1,192,28,28] | 9894 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
105 | densenet1_stage2_concat7 | Concat | [1,528,28,28] | 118 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 20.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
105 | densenet1_stage2_concat7 | Concat | [1,528,28,28] | 118 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
106 | densenet1_stage2_batchnorm16_fwd | BatchNorm | [1,576,28,28] | 57.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 31.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
107 | densenet1_stage2_relu16_fwd | Activation | [1,576,28,28] | 35.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 23.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
108 | densenet1_stage2_conv16_fwd | Convolution | [1,576,28,28] | 10744 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 172.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
108 | densenet1_stage2_conv16_fwd | Convolution | [1,576,28,28] | 10744 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
109 | densenet1_stage2_batchnorm17_fwd | BatchNorm | [1,192,28,28] | 96 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
110 | densenet1_stage2_relu17_fwd | Activation | [1,192,28,28] | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
111 | densenet1_stage2_conv17_fwd | Convolution | [1,192,28,28] | 9915.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
111 | densenet1_stage2_conv17_fwd | Convolution | [1,192,28,28] | 9915.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
112 | densenet1_stage2_concat8 | Concat | [1,576,28,28] | 113.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 21.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
112 | densenet1_stage2_concat8 | Concat | [1,576,28,28] | 113.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 17.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
113 | densenet1_stage2_batchnorm18_fwd | BatchNorm | [1,624,28,28] | 73.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 33.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
114 | densenet1_stage2_relu18_fwd | Activation | [1,624,28,28] | 38.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 27.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
115 | densenet1_stage2_conv18_fwd | Convolution | [1,624,28,28] | 11247 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 184.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
115 | densenet1_stage2_conv18_fwd | Convolution | [1,624,28,28] | 11247 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
116 | densenet1_stage2_batchnorm19_fwd | BatchNorm | [1,192,28,28] | 99 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
117 | densenet1_stage2_relu19_fwd | Activation | [1,192,28,28] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
118 | densenet1_stage2_conv19_fwd | Convolution | [1,192,28,28] | 9905.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
118 | densenet1_stage2_conv19_fwd | Convolution | [1,192,28,28] | 9905.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
119 | densenet1_stage2_concat9 | Concat | [1,624,28,28] | 117.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 22.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
119 | densenet1_stage2_concat9 | Concat | [1,624,28,28] | 117.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 18.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
120 | densenet1_stage2_batchnorm20_fwd | BatchNorm | [1,672,28,28] | 62.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 36.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
121 | densenet1_stage2_relu20_fwd | Activation | [1,672,28,28] | 39.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
122 | densenet1_stage2_conv20_fwd | Convolution | [1,672,28,28] | 12105.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 197.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
122 | densenet1_stage2_conv20_fwd | Convolution | [1,672,28,28] | 12105.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
123 | densenet1_stage2_batchnorm21_fwd | BatchNorm | [1,192,28,28] | 95 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
124 | densenet1_stage2_relu21_fwd | Activation | [1,192,28,28] | 18.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
125 | densenet1_stage2_conv21_fwd | Convolution | [1,192,28,28] | 9893 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
125 | densenet1_stage2_conv21_fwd | Convolution | [1,192,28,28] | 9893 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
126 | densenet1_stage2_concat10 | Concat | [1,672,28,28] | 116 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 24.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
126 | densenet1_stage2_concat10 | Concat | [1,672,28,28] | 116 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 19.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
127 | densenet1_stage2_batchnorm22_fwd | BatchNorm | [1,720,28,28] | 84 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 37.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
128 | densenet1_stage2_relu22_fwd | Activation | [1,720,28,28] | 42 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 34.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
129 | densenet1_stage2_conv22_fwd | Convolution | [1,720,28,28] | 12960.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 209.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
129 | densenet1_stage2_conv22_fwd | Convolution | [1,720,28,28] | 12960.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
130 | densenet1_stage2_batchnorm23_fwd | BatchNorm | [1,192,28,28] | 98 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
131 | densenet1_stage2_relu23_fwd | Activation | [1,192,28,28] | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
132 | densenet1_stage2_conv23_fwd | Convolution | [1,192,28,28] | 9905 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 58.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
132 | densenet1_stage2_conv23_fwd | Convolution | [1,192,28,28] | 9905 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
133 | densenet1_stage2_concat11 | Concat | [1,720,28,28] | 123 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 26.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
133 | densenet1_stage2_concat11 | Concat | [1,720,28,28] | 123 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 21.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
134 | densenet1_batchnorm2_fwd | BatchNorm | [1,768,28,28] | 67 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 40.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
135 | densenet1_relu2_fwd | Activation | [1,768,28,28] | 44.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
136 | densenet1_conv2_fwd | Convolution | [1,768,28,28] | 27024 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x128_relu_interior_nn | 276.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
136 | densenet1_conv2_fwd | Convolution | [1,768,28,28] | 27024 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
137 | densenet1_pool2_fwd | Pooling | [1,384,28,28] | 907 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 17.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
138 | densenet1_stage3_batchnorm0_fwd | BatchNorm | [1,384,14,14] | 98.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
139 | densenet1_stage3_relu0_fwd | Activation | [1,384,14,14] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
140 | densenet1_stage3_conv0_fwd | Convolution | [1,384,14,14] | 1761 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 111.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
140 | densenet1_stage3_conv0_fwd | Convolution | [1,384,14,14] | 1761 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
141 | densenet1_stage3_batchnorm1_fwd | BatchNorm | [1,192,14,14] | 46.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
142 | densenet1_stage3_relu1_fwd | Activation | [1,192,14,14] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
143 | densenet1_stage3_conv1_fwd | Convolution | [1,192,14,14] | 2566.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 60.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
143 | densenet1_stage3_conv1_fwd | Convolution | [1,192,14,14] | 2566.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
144 | densenet1_stage3_concat0 | Concat | [1,384,14,14] | 74.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
144 | densenet1_stage3_concat0 | Concat | [1,384,14,14] | 74.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
145 | densenet1_stage3_batchnorm2_fwd | BatchNorm | [1,432,14,14] | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
146 | densenet1_stage3_relu2_fwd | Activation | [1,432,14,14] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
147 | densenet1_stage3_conv2_fwd | Convolution | [1,432,14,14] | 1982 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 120.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
147 | densenet1_stage3_conv2_fwd | Convolution | [1,432,14,14] | 1982 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
148 | densenet1_stage3_batchnorm3_fwd | BatchNorm | [1,192,14,14] | 47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
149 | densenet1_stage3_relu3_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
150 | densenet1_stage3_conv3_fwd | Convolution | [1,192,14,14] | 2558.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
150 | densenet1_stage3_conv3_fwd | Convolution | [1,192,14,14] | 2558.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
151 | densenet1_stage3_concat1 | Concat | [1,432,14,14] | 77.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
151 | densenet1_stage3_concat1 | Concat | [1,432,14,14] | 77.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
152 | densenet1_stage3_batchnorm4_fwd | BatchNorm | [1,480,14,14] | 28.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
153 | densenet1_stage3_relu4_fwd | Activation | [1,480,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
154 | densenet1_stage3_conv4_fwd | Convolution | [1,480,14,14] | 2198.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 131.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
154 | densenet1_stage3_conv4_fwd | Convolution | [1,480,14,14] | 2198.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
155 | densenet1_stage3_batchnorm5_fwd | BatchNorm | [1,192,14,14] | 47.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
156 | densenet1_stage3_relu5_fwd | Activation | [1,192,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
157 | densenet1_stage3_conv5_fwd | Convolution | [1,192,14,14] | 2572.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
157 | densenet1_stage3_conv5_fwd | Convolution | [1,192,14,14] | 2572.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
158 | densenet1_stage3_concat2 | Concat | [1,480,14,14] | 81 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
158 | densenet1_stage3_concat2 | Concat | [1,480,14,14] | 81 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
159 | densenet1_stage3_batchnorm6_fwd | BatchNorm | [1,528,14,14] | 29.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
160 | densenet1_stage3_relu6_fwd | Activation | [1,528,14,14] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
161 | densenet1_stage3_conv6_fwd | Convolution | [1,528,14,14] | 2410.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 143.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
161 | densenet1_stage3_conv6_fwd | Convolution | [1,528,14,14] | 2410.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
162 | densenet1_stage3_batchnorm7_fwd | BatchNorm | [1,192,14,14] | 65.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
163 | densenet1_stage3_relu7_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
164 | densenet1_stage3_conv7_fwd | Convolution | [1,192,14,14] | 2563 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
164 | densenet1_stage3_conv7_fwd | Convolution | [1,192,14,14] | 2563 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
165 | densenet1_stage3_concat3 | Concat | [1,528,14,14] | 73 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
165 | densenet1_stage3_concat3 | Concat | [1,528,14,14] | 73 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
166 | densenet1_stage3_batchnorm8_fwd | BatchNorm | [1,576,14,14] | 30.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
167 | densenet1_stage3_relu8_fwd | Activation | [1,576,14,14] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
168 | densenet1_stage3_conv8_fwd | Convolution | [1,576,14,14] | 2717 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 154.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
168 | densenet1_stage3_conv8_fwd | Convolution | [1,576,14,14] | 2717 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
169 | densenet1_stage3_batchnorm9_fwd | BatchNorm | [1,192,14,14] | 75.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
170 | densenet1_stage3_relu9_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
171 | densenet1_stage3_conv9_fwd | Convolution | [1,192,14,14] | 2567 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
171 | densenet1_stage3_conv9_fwd | Convolution | [1,192,14,14] | 2567 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
172 | densenet1_stage3_concat4 | Concat | [1,576,14,14] | 76.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
172 | densenet1_stage3_concat4 | Concat | [1,576,14,14] | 76.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
173 | densenet1_stage3_batchnorm10_fwd | BatchNorm | [1,624,14,14] | 31.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
174 | densenet1_stage3_relu10_fwd | Activation | [1,624,14,14] | 17.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
175 | densenet1_stage3_conv10_fwd | Convolution | [1,624,14,14] | 2919.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 166.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
175 | densenet1_stage3_conv10_fwd | Convolution | [1,624,14,14] | 2919.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
176 | densenet1_stage3_batchnorm11_fwd | BatchNorm | [1,192,14,14] | 70.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
177 | densenet1_stage3_relu11_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
178 | densenet1_stage3_conv11_fwd | Convolution | [1,192,14,14] | 2565 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
178 | densenet1_stage3_conv11_fwd | Convolution | [1,192,14,14] | 2565 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
179 | densenet1_stage3_concat5 | Concat | [1,624,14,14] | 81.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
179 | densenet1_stage3_concat5 | Concat | [1,624,14,14] | 81.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
180 | densenet1_stage3_batchnorm12_fwd | BatchNorm | [1,672,14,14] | 32.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
181 | densenet1_stage3_relu12_fwd | Activation | [1,672,14,14] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
182 | densenet1_stage3_conv12_fwd | Convolution | [1,672,14,14] | 3133.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 176.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
182 | densenet1_stage3_conv12_fwd | Convolution | [1,672,14,14] | 3133.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
183 | densenet1_stage3_batchnorm13_fwd | BatchNorm | [1,192,14,14] | 76.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
184 | densenet1_stage3_relu13_fwd | Activation | [1,192,14,14] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
185 | densenet1_stage3_conv13_fwd | Convolution | [1,192,14,14] | 2565 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
185 | densenet1_stage3_conv13_fwd | Convolution | [1,192,14,14] | 2565 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
186 | densenet1_stage3_concat6 | Concat | [1,672,14,14] | 75 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
186 | densenet1_stage3_concat6 | Concat | [1,672,14,14] | 75 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
187 | densenet1_stage3_batchnorm14_fwd | BatchNorm | [1,720,14,14] | 33.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
188 | densenet1_stage3_relu14_fwd | Activation | [1,720,14,14] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
189 | densenet1_stage3_conv14_fwd | Convolution | [1,720,14,14] | 3352.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 188.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
189 | densenet1_stage3_conv14_fwd | Convolution | [1,720,14,14] | 3352.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
190 | densenet1_stage3_batchnorm15_fwd | BatchNorm | [1,192,14,14] | 73.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
191 | densenet1_stage3_relu15_fwd | Activation | [1,192,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
192 | densenet1_stage3_conv15_fwd | Convolution | [1,192,14,14] | 2564.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
192 | densenet1_stage3_conv15_fwd | Convolution | [1,192,14,14] | 2564.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
193 | densenet1_stage3_concat7 | Concat | [1,720,14,14] | 79.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
193 | densenet1_stage3_concat7 | Concat | [1,720,14,14] | 79.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
194 | densenet1_stage3_batchnorm16_fwd | BatchNorm | [1,768,14,14] | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
195 | densenet1_stage3_relu16_fwd | Activation | [1,768,14,14] | 18.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
196 | densenet1_stage3_conv16_fwd | Convolution | [1,768,14,14] | 3583.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 199.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
196 | densenet1_stage3_conv16_fwd | Convolution | [1,768,14,14] | 3583.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
197 | densenet1_stage3_batchnorm17_fwd | BatchNorm | [1,192,14,14] | 81.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
198 | densenet1_stage3_relu17_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
199 | densenet1_stage3_conv17_fwd | Convolution | [1,192,14,14] | 2557.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
199 | densenet1_stage3_conv17_fwd | Convolution | [1,192,14,14] | 2557.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
200 | densenet1_stage3_concat8 | Concat | [1,768,14,14] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
200 | densenet1_stage3_concat8 | Concat | [1,768,14,14] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
201 | densenet1_stage3_batchnorm18_fwd | BatchNorm | [1,816,14,14] | 35.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
202 | densenet1_stage3_relu18_fwd | Activation | [1,816,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
203 | densenet1_stage3_conv18_fwd | Convolution | [1,816,14,14] | 3799.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 211.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
203 | densenet1_stage3_conv18_fwd | Convolution | [1,816,14,14] | 3799.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
204 | densenet1_stage3_batchnorm19_fwd | BatchNorm | [1,192,14,14] | 77.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
205 | densenet1_stage3_relu19_fwd | Activation | [1,192,14,14] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
206 | densenet1_stage3_conv19_fwd | Convolution | [1,192,14,14] | 2583.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
206 | densenet1_stage3_conv19_fwd | Convolution | [1,192,14,14] | 2583.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
207 | densenet1_stage3_concat9 | Concat | [1,816,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
207 | densenet1_stage3_concat9 | Concat | [1,816,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
208 | densenet1_stage3_batchnorm20_fwd | BatchNorm | [1,864,14,14] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
209 | densenet1_stage3_relu20_fwd | Activation | [1,864,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
210 | densenet1_stage3_conv20_fwd | Convolution | [1,864,14,14] | 4017 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 222.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
210 | densenet1_stage3_conv20_fwd | Convolution | [1,864,14,14] | 4017 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
211 | densenet1_stage3_batchnorm21_fwd | BatchNorm | [1,192,14,14] | 82 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
212 | densenet1_stage3_relu21_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
213 | densenet1_stage3_conv21_fwd | Convolution | [1,192,14,14] | 2576.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
213 | densenet1_stage3_conv21_fwd | Convolution | [1,192,14,14] | 2576.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
214 | densenet1_stage3_concat10 | Concat | [1,864,14,14] | 81.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
214 | densenet1_stage3_concat10 | Concat | [1,864,14,14] | 81.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
215 | densenet1_stage3_batchnorm22_fwd | BatchNorm | [1,912,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
216 | densenet1_stage3_relu22_fwd | Activation | [1,912,14,14] | 20.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
217 | densenet1_stage3_conv22_fwd | Convolution | [1,912,14,14] | 4223 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 234.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
217 | densenet1_stage3_conv22_fwd | Convolution | [1,912,14,14] | 4223 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
218 | densenet1_stage3_batchnorm23_fwd | BatchNorm | [1,192,14,14] | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
219 | densenet1_stage3_relu23_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
220 | densenet1_stage3_conv23_fwd | Convolution | [1,192,14,14] | 2561.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
220 | densenet1_stage3_conv23_fwd | Convolution | [1,192,14,14] | 2561.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
221 | densenet1_stage3_concat11 | Concat | [1,912,14,14] | 77 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
221 | densenet1_stage3_concat11 | Concat | [1,912,14,14] | 77 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
222 | densenet1_stage3_batchnorm24_fwd | BatchNorm | [1,960,14,14] | 38 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
223 | densenet1_stage3_relu24_fwd | Activation | [1,960,14,14] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
224 | densenet1_stage3_conv24_fwd | Convolution | [1,960,14,14] | 4449 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 245.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
224 | densenet1_stage3_conv24_fwd | Convolution | [1,960,14,14] | 4449 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
225 | densenet1_stage3_batchnorm25_fwd | BatchNorm | [1,192,14,14] | 86.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
226 | densenet1_stage3_relu25_fwd | Activation | [1,192,14,14] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
227 | densenet1_stage3_conv25_fwd | Convolution | [1,192,14,14] | 2563 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
227 | densenet1_stage3_conv25_fwd | Convolution | [1,192,14,14] | 2563 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
228 | densenet1_stage3_concat12 | Concat | [1,960,14,14] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
228 | densenet1_stage3_concat12 | Concat | [1,960,14,14] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
229 | densenet1_stage3_batchnorm26_fwd | BatchNorm | [1,1008,14,14] | 37.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
230 | densenet1_stage3_relu26_fwd | Activation | [1,1008,14,14] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
231 | densenet1_stage3_conv26_fwd | Convolution | [1,1008,14,14] | 4665 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 256.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
231 | densenet1_stage3_conv26_fwd | Convolution | [1,1008,14,14] | 4665 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
232 | densenet1_stage3_batchnorm27_fwd | BatchNorm | [1,192,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
233 | densenet1_stage3_relu27_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
234 | densenet1_stage3_conv27_fwd | Convolution | [1,192,14,14] | 2564.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
234 | densenet1_stage3_conv27_fwd | Convolution | [1,192,14,14] | 2564.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
235 | densenet1_stage3_concat13 | Concat | [1,1008,14,14] | 77.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
235 | densenet1_stage3_concat13 | Concat | [1,1008,14,14] | 77.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
236 | densenet1_stage3_batchnorm28_fwd | BatchNorm | [1,1056,14,14] | 39.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
237 | densenet1_stage3_relu28_fwd | Activation | [1,1056,14,14] | 20.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
238 | densenet1_stage3_conv28_fwd | Convolution | [1,1056,14,14] | 4892.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 268.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
238 | densenet1_stage3_conv28_fwd | Convolution | [1,1056,14,14] | 4892.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
239 | densenet1_stage3_batchnorm29_fwd | BatchNorm | [1,192,14,14] | 78.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
240 | densenet1_stage3_relu29_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
241 | densenet1_stage3_conv29_fwd | Convolution | [1,192,14,14] | 2579.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
241 | densenet1_stage3_conv29_fwd | Convolution | [1,192,14,14] | 2579.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
242 | densenet1_stage3_concat14 | Concat | [1,1056,14,14] | 82.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
242 | densenet1_stage3_concat14 | Concat | [1,1056,14,14] | 82.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
243 | densenet1_stage3_batchnorm30_fwd | BatchNorm | [1,1104,14,14] | 41.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
244 | densenet1_stage3_relu30_fwd | Activation | [1,1104,14,14] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
245 | densenet1_stage3_conv30_fwd | Convolution | [1,1104,14,14] | 5107.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 280.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
245 | densenet1_stage3_conv30_fwd | Convolution | [1,1104,14,14] | 5107.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
246 | densenet1_stage3_batchnorm31_fwd | BatchNorm | [1,192,14,14] | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
247 | densenet1_stage3_relu31_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
248 | densenet1_stage3_conv31_fwd | Convolution | [1,192,14,14] | 2661.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
248 | densenet1_stage3_conv31_fwd | Convolution | [1,192,14,14] | 2661.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
249 | densenet1_stage3_concat15 | Concat | [1,1104,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
249 | densenet1_stage3_concat15 | Concat | [1,1104,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
250 | densenet1_stage3_batchnorm32_fwd | BatchNorm | [1,1152,14,14] | 42.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
251 | densenet1_stage3_relu32_fwd | Activation | [1,1152,14,14] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
252 | densenet1_stage3_conv32_fwd | Convolution | [1,1152,14,14] | 5407 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 291.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
252 | densenet1_stage3_conv32_fwd | Convolution | [1,1152,14,14] | 5407 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
253 | densenet1_stage3_batchnorm33_fwd | BatchNorm | [1,192,14,14] | 81.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
254 | densenet1_stage3_relu33_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
255 | densenet1_stage3_conv33_fwd | Convolution | [1,192,14,14] | 2589.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
255 | densenet1_stage3_conv33_fwd | Convolution | [1,192,14,14] | 2589.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
256 | densenet1_stage3_concat16 | Concat | [1,1152,14,14] | 82.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
256 | densenet1_stage3_concat16 | Concat | [1,1152,14,14] | 82.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
257 | densenet1_stage3_batchnorm34_fwd | BatchNorm | [1,1200,14,14] | 41.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
258 | densenet1_stage3_relu34_fwd | Activation | [1,1200,14,14] | 23.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
259 | densenet1_stage3_conv34_fwd | Convolution | [1,1200,14,14] | 5637.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 302.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
259 | densenet1_stage3_conv34_fwd | Convolution | [1,1200,14,14] | 5637.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
260 | densenet1_stage3_batchnorm35_fwd | BatchNorm | [1,192,14,14] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
261 | densenet1_stage3_relu35_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
262 | densenet1_stage3_conv35_fwd | Convolution | [1,192,14,14] | 2675 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
262 | densenet1_stage3_conv35_fwd | Convolution | [1,192,14,14] | 2675 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
263 | densenet1_stage3_concat17 | Concat | [1,1200,14,14] | 75.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
263 | densenet1_stage3_concat17 | Concat | [1,1200,14,14] | 75.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
264 | densenet1_stage3_batchnorm36_fwd | BatchNorm | [1,1248,14,14] | 43.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
265 | densenet1_stage3_relu36_fwd | Activation | [1,1248,14,14] | 24.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
266 | densenet1_stage3_conv36_fwd | Convolution | [1,1248,14,14] | 5934.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 313.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
266 | densenet1_stage3_conv36_fwd | Convolution | [1,1248,14,14] | 5934.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
267 | densenet1_stage3_batchnorm37_fwd | BatchNorm | [1,192,14,14] | 86.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
268 | densenet1_stage3_relu37_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
269 | densenet1_stage3_conv37_fwd | Convolution | [1,192,14,14] | 2754 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
269 | densenet1_stage3_conv37_fwd | Convolution | [1,192,14,14] | 2754 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
270 | densenet1_stage3_concat18 | Concat | [1,1248,14,14] | 78.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 14.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
270 | densenet1_stage3_concat18 | Concat | [1,1248,14,14] | 78.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
271 | densenet1_stage3_batchnorm38_fwd | BatchNorm | [1,1296,14,14] | 43 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
272 | densenet1_stage3_relu38_fwd | Activation | [1,1296,14,14] | 24.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
273 | densenet1_stage3_conv38_fwd | Convolution | [1,1296,14,14] | 6271.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 325.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
273 | densenet1_stage3_conv38_fwd | Convolution | [1,1296,14,14] | 6271.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
274 | densenet1_stage3_batchnorm39_fwd | BatchNorm | [1,192,14,14] | 93.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
275 | densenet1_stage3_relu39_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
276 | densenet1_stage3_conv39_fwd | Convolution | [1,192,14,14] | 2674.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
276 | densenet1_stage3_conv39_fwd | Convolution | [1,192,14,14] | 2674.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
277 | densenet1_stage3_concat19 | Concat | [1,1296,14,14] | 83.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 14.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
277 | densenet1_stage3_concat19 | Concat | [1,1296,14,14] | 83.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 11.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
278 | densenet1_stage3_batchnorm40_fwd | BatchNorm | [1,1344,14,14] | 46.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 15.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
279 | densenet1_stage3_relu40_fwd | Activation | [1,1344,14,14] | 25.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
280 | densenet1_stage3_conv40_fwd | Convolution | [1,1344,14,14] | 6268 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 337.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
280 | densenet1_stage3_conv40_fwd | Convolution | [1,1344,14,14] | 6268 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
281 | densenet1_stage3_batchnorm41_fwd | BatchNorm | [1,192,14,14] | 88.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
282 | densenet1_stage3_relu41_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
283 | densenet1_stage3_conv41_fwd | Convolution | [1,192,14,14] | 2609.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 57.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
283 | densenet1_stage3_conv41_fwd | Convolution | [1,192,14,14] | 2609.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
284 | densenet1_stage3_concat20 | Concat | [1,1344,14,14] | 86.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 14.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
284 | densenet1_stage3_concat20 | Concat | [1,1344,14,14] | 86.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
285 | densenet1_stage3_batchnorm42_fwd | BatchNorm | [1,1392,14,14] | 44.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 16.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
286 | densenet1_stage3_relu42_fwd | Activation | [1,1392,14,14] | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 8.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
287 | densenet1_stage3_conv42_fwd | Convolution | [1,1392,14,14] | 6490.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 352.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
287 | densenet1_stage3_conv42_fwd | Convolution | [1,1392,14,14] | 6490.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
288 | densenet1_stage3_batchnorm43_fwd | BatchNorm | [1,192,14,14] | 89.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
289 | densenet1_stage3_relu43_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
290 | densenet1_stage3_conv43_fwd | Convolution | [1,192,14,14] | 2857.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
290 | densenet1_stage3_conv43_fwd | Convolution | [1,192,14,14] | 2857.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
291 | densenet1_stage3_concat21 | Concat | [1,1392,14,14] | 79 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 15.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
291 | densenet1_stage3_concat21 | Concat | [1,1392,14,14] | 79 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
292 | densenet1_stage3_batchnorm44_fwd | BatchNorm | [1,1440,14,14] | 49 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 17.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
293 | densenet1_stage3_relu44_fwd | Activation | [1,1440,14,14] | 26.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 8.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
294 | densenet1_stage3_conv44_fwd | Convolution | [1,1440,14,14] | 6938 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 370.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
294 | densenet1_stage3_conv44_fwd | Convolution | [1,1440,14,14] | 6938 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
295 | densenet1_stage3_batchnorm45_fwd | BatchNorm | [1,192,14,14] | 88.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
296 | densenet1_stage3_relu45_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
297 | densenet1_stage3_conv45_fwd | Convolution | [1,192,14,14] | 2586 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
297 | densenet1_stage3_conv45_fwd | Convolution | [1,192,14,14] | 2586 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
298 | densenet1_stage3_concat22 | Concat | [1,1440,14,14] | 79.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
298 | densenet1_stage3_concat22 | Concat | [1,1440,14,14] | 79.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
299 | densenet1_stage3_batchnorm46_fwd | BatchNorm | [1,1488,14,14] | 46.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 18.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
300 | densenet1_stage3_relu46_fwd | Activation | [1,1488,14,14] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 8.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
301 | densenet1_stage3_conv46_fwd | Convolution | [1,1488,14,14] | 6972.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 381.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
301 | densenet1_stage3_conv46_fwd | Convolution | [1,1488,14,14] | 6972.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
302 | densenet1_stage3_batchnorm47_fwd | BatchNorm | [1,192,14,14] | 90 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
303 | densenet1_stage3_relu47_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
304 | densenet1_stage3_conv47_fwd | Convolution | [1,192,14,14] | 2578.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
304 | densenet1_stage3_conv47_fwd | Convolution | [1,192,14,14] | 2578.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
305 | densenet1_stage3_concat23 | Concat | [1,1488,14,14] | 82.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
305 | densenet1_stage3_concat23 | Concat | [1,1488,14,14] | 82.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 12.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
306 | densenet1_stage3_batchnorm48_fwd | BatchNorm | [1,1536,14,14] | 49 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 19.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
307 | densenet1_stage3_relu48_fwd | Activation | [1,1536,14,14] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
308 | densenet1_stage3_conv48_fwd | Convolution | [1,1536,14,14] | 7100.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 396.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
308 | densenet1_stage3_conv48_fwd | Convolution | [1,1536,14,14] | 7100.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
309 | densenet1_stage3_batchnorm49_fwd | BatchNorm | [1,192,14,14] | 87.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
310 | densenet1_stage3_relu49_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
311 | densenet1_stage3_conv49_fwd | Convolution | [1,192,14,14] | 2560 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
311 | densenet1_stage3_conv49_fwd | Convolution | [1,192,14,14] | 2560 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
312 | densenet1_stage3_concat24 | Concat | [1,1536,14,14] | 78.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
312 | densenet1_stage3_concat24 | Concat | [1,1536,14,14] | 78.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
313 | densenet1_stage3_batchnorm50_fwd | BatchNorm | [1,1584,14,14] | 47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 20.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
314 | densenet1_stage3_relu50_fwd | Activation | [1,1584,14,14] | 27.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
315 | densenet1_stage3_conv50_fwd | Convolution | [1,1584,14,14] | 7304 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 413.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
315 | densenet1_stage3_conv50_fwd | Convolution | [1,1584,14,14] | 7304 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
316 | densenet1_stage3_batchnorm51_fwd | BatchNorm | [1,192,14,14] | 88.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
317 | densenet1_stage3_relu51_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
318 | densenet1_stage3_conv51_fwd | Convolution | [1,192,14,14] | 2585.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
318 | densenet1_stage3_conv51_fwd | Convolution | [1,192,14,14] | 2585.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
319 | densenet1_stage3_concat25 | Concat | [1,1584,14,14] | 82 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 17.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
319 | densenet1_stage3_concat25 | Concat | [1,1584,14,14] | 82 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
320 | densenet1_stage3_batchnorm52_fwd | BatchNorm | [1,1632,14,14] | 51.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 21.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
321 | densenet1_stage3_relu52_fwd | Activation | [1,1632,14,14] | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
322 | densenet1_stage3_conv52_fwd | Convolution | [1,1632,14,14] | 7538.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 427.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
322 | densenet1_stage3_conv52_fwd | Convolution | [1,1632,14,14] | 7538.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
323 | densenet1_stage3_batchnorm53_fwd | BatchNorm | [1,192,14,14] | 90 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
324 | densenet1_stage3_relu53_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
325 | densenet1_stage3_conv53_fwd | Convolution | [1,192,14,14] | 2571 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
325 | densenet1_stage3_conv53_fwd | Convolution | [1,192,14,14] | 2571 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
326 | densenet1_stage3_concat26 | Concat | [1,1632,14,14] | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
326 | densenet1_stage3_concat26 | Concat | [1,1632,14,14] | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 13.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
327 | densenet1_stage3_batchnorm54_fwd | BatchNorm | [1,1680,14,14] | 46.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 23.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
328 | densenet1_stage3_relu54_fwd | Activation | [1,1680,14,14] | 28.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 10.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
329 | densenet1_stage3_conv54_fwd | Convolution | [1,1680,14,14] | 7752 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 442.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
329 | densenet1_stage3_conv54_fwd | Convolution | [1,1680,14,14] | 7752 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
330 | densenet1_stage3_batchnorm55_fwd | BatchNorm | [1,192,14,14] | 88.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
331 | densenet1_stage3_relu55_fwd | Activation | [1,192,14,14] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
332 | densenet1_stage3_conv55_fwd | Convolution | [1,192,14,14] | 2575.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
332 | densenet1_stage3_conv55_fwd | Convolution | [1,192,14,14] | 2575.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
333 | densenet1_stage3_concat27 | Concat | [1,1680,14,14] | 83.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 17.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
333 | densenet1_stage3_concat27 | Concat | [1,1680,14,14] | 83.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 14.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
334 | densenet1_stage3_batchnorm56_fwd | BatchNorm | [1,1728,14,14] | 52 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 23.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
335 | densenet1_stage3_relu56_fwd | Activation | [1,1728,14,14] | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
336 | densenet1_stage3_conv56_fwd | Convolution | [1,1728,14,14] | 8003 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 457.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
336 | densenet1_stage3_conv56_fwd | Convolution | [1,1728,14,14] | 8003 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
337 | densenet1_stage3_batchnorm57_fwd | BatchNorm | [1,192,14,14] | 89.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
338 | densenet1_stage3_relu57_fwd | Activation | [1,192,14,14] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
339 | densenet1_stage3_conv57_fwd | Convolution | [1,192,14,14] | 2633.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
339 | densenet1_stage3_conv57_fwd | Convolution | [1,192,14,14] | 2633.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
340 | densenet1_stage3_concat28 | Concat | [1,1728,14,14] | 82.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 17.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
340 | densenet1_stage3_concat28 | Concat | [1,1728,14,14] | 82.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 14.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
341 | densenet1_stage3_batchnorm58_fwd | BatchNorm | [1,1776,14,14] | 51.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 24.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
342 | densenet1_stage3_relu58_fwd | Activation | [1,1776,14,14] | 30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
343 | densenet1_stage3_conv58_fwd | Convolution | [1,1776,14,14] | 8249.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 472.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
343 | densenet1_stage3_conv58_fwd | Convolution | [1,1776,14,14] | 8249.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
344 | densenet1_stage3_batchnorm59_fwd | BatchNorm | [1,192,14,14] | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
345 | densenet1_stage3_relu59_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
346 | densenet1_stage3_conv59_fwd | Convolution | [1,192,14,14] | 2574 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
346 | densenet1_stage3_conv59_fwd | Convolution | [1,192,14,14] | 2574 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
347 | densenet1_stage3_concat29 | Concat | [1,1776,14,14] | 92 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 18.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
347 | densenet1_stage3_concat29 | Concat | [1,1776,14,14] | 92 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 15.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
348 | densenet1_stage3_batchnorm60_fwd | BatchNorm | [1,1824,14,14] | 54.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 26.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
349 | densenet1_stage3_relu60_fwd | Activation | [1,1824,14,14] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
350 | densenet1_stage3_conv60_fwd | Convolution | [1,1824,14,14] | 8450 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 484.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
350 | densenet1_stage3_conv60_fwd | Convolution | [1,1824,14,14] | 8450 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
351 | densenet1_stage3_batchnorm61_fwd | BatchNorm | [1,192,14,14] | 90 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
352 | densenet1_stage3_relu61_fwd | Activation | [1,192,14,14] | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
353 | densenet1_stage3_conv61_fwd | Convolution | [1,192,14,14] | 2586 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
353 | densenet1_stage3_conv61_fwd | Convolution | [1,192,14,14] | 2586 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
354 | densenet1_stage3_concat30 | Concat | [1,1824,14,14] | 89.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 19.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
354 | densenet1_stage3_concat30 | Concat | [1,1824,14,14] | 89.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 15.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
355 | densenet1_stage3_batchnorm62_fwd | BatchNorm | [1,1872,14,14] | 51.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 26.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
356 | densenet1_stage3_relu62_fwd | Activation | [1,1872,14,14] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 14.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
357 | densenet1_stage3_conv62_fwd | Convolution | [1,1872,14,14] | 8698.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 500.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
357 | densenet1_stage3_conv62_fwd | Convolution | [1,1872,14,14] | 8698.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
358 | densenet1_stage3_batchnorm63_fwd | BatchNorm | [1,192,14,14] | 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
359 | densenet1_stage3_relu63_fwd | Activation | [1,192,14,14] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
360 | densenet1_stage3_conv63_fwd | Convolution | [1,192,14,14] | 2631.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
360 | densenet1_stage3_conv63_fwd | Convolution | [1,192,14,14] | 2631.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
361 | densenet1_stage3_concat31 | Concat | [1,1872,14,14] | 83 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 19.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
361 | densenet1_stage3_concat31 | Concat | [1,1872,14,14] | 83 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 15.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
362 | densenet1_stage3_batchnorm64_fwd | BatchNorm | [1,1920,14,14] | 57.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 27.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
363 | densenet1_stage3_relu64_fwd | Activation | [1,1920,14,14] | 31.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 15.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
364 | densenet1_stage3_conv64_fwd | Convolution | [1,1920,14,14] | 9056 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 517.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
364 | densenet1_stage3_conv64_fwd | Convolution | [1,1920,14,14] | 9056 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
365 | densenet1_stage3_batchnorm65_fwd | BatchNorm | [1,192,14,14] | 88.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
366 | densenet1_stage3_relu65_fwd | Activation | [1,192,14,14] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
367 | densenet1_stage3_conv65_fwd | Convolution | [1,192,14,14] | 2665 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
367 | densenet1_stage3_conv65_fwd | Convolution | [1,192,14,14] | 2665 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
368 | densenet1_stage3_concat32 | Concat | [1,1920,14,14] | 86 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 19.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
368 | densenet1_stage3_concat32 | Concat | [1,1920,14,14] | 86 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 15.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
369 | densenet1_stage3_batchnorm66_fwd | BatchNorm | [1,1968,14,14] | 51 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 28.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
370 | densenet1_stage3_relu66_fwd | Activation | [1,1968,14,14] | 32 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 16.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
371 | densenet1_stage3_conv66_fwd | Convolution | [1,1968,14,14] | 9257.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 530.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
371 | densenet1_stage3_conv66_fwd | Convolution | [1,1968,14,14] | 9257.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
372 | densenet1_stage3_batchnorm67_fwd | BatchNorm | [1,192,14,14] | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
373 | densenet1_stage3_relu67_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
374 | densenet1_stage3_conv67_fwd | Convolution | [1,192,14,14] | 2648.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
374 | densenet1_stage3_conv67_fwd | Convolution | [1,192,14,14] | 2648.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
375 | densenet1_stage3_concat33 | Concat | [1,1968,14,14] | 85.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 20.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
375 | densenet1_stage3_concat33 | Concat | [1,1968,14,14] | 85.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
376 | densenet1_stage3_batchnorm68_fwd | BatchNorm | [1,2016,14,14] | 56.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 29.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
377 | densenet1_stage3_relu68_fwd | Activation | [1,2016,14,14] | 31.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 17.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
378 | densenet1_stage3_conv68_fwd | Convolution | [1,2016,14,14] | 9639.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 539.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
378 | densenet1_stage3_conv68_fwd | Convolution | [1,2016,14,14] | 9639.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
379 | densenet1_stage3_batchnorm69_fwd | BatchNorm | [1,192,14,14] | 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
380 | densenet1_stage3_relu69_fwd | Activation | [1,192,14,14] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
381 | densenet1_stage3_conv69_fwd | Convolution | [1,192,14,14] | 2903 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
381 | densenet1_stage3_conv69_fwd | Convolution | [1,192,14,14] | 2903 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
382 | densenet1_stage3_concat34 | Concat | [1,2016,14,14] | 88.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 20.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
382 | densenet1_stage3_concat34 | Concat | [1,2016,14,14] | 88.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
383 | densenet1_stage3_batchnorm70_fwd | BatchNorm | [1,2064,14,14] | 53.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
384 | densenet1_stage3_relu70_fwd | Activation | [1,2064,14,14] | 33.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 18.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
385 | densenet1_stage3_conv70_fwd | Convolution | [1,2064,14,14] | 9851.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 552.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
385 | densenet1_stage3_conv70_fwd | Convolution | [1,2064,14,14] | 9851.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
386 | densenet1_stage3_batchnorm71_fwd | BatchNorm | [1,192,14,14] | 91.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
387 | densenet1_stage3_relu71_fwd | Activation | [1,192,14,14] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
388 | densenet1_stage3_conv71_fwd | Convolution | [1,192,14,14] | 2905.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
388 | densenet1_stage3_conv71_fwd | Convolution | [1,192,14,14] | 2905.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
389 | densenet1_stage3_concat35 | Concat | [1,2064,14,14] | 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 21.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
389 | densenet1_stage3_concat35 | Concat | [1,2064,14,14] | 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 16.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
390 | densenet1_batchnorm3_fwd | BatchNorm | [1,2112,14,14] | 62.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
391 | densenet1_relu3_fwd | Activation | [1,2112,14,14] | 33.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 18.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
392 | densenet1_conv3_fwd | Convolution | [1,2112,14,14] | 51624 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x128_relu_interior_nn | 481.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
392 | densenet1_conv3_fwd | Convolution | [1,2112,14,14] | 51624 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
393 | densenet1_pool3_fwd | Pooling | [1,1056,14,14] | 632.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 15.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
394 | densenet1_stage4_batchnorm0_fwd | BatchNorm | [1,1056,7,7] | 97.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
395 | densenet1_stage4_relu0_fwd | Activation | [1,1056,7,7] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
396 | densenet1_stage4_conv0_fwd | Convolution | [1,1056,7,7] | 1788 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 133.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
396 | densenet1_stage4_conv0_fwd | Convolution | [1,1056,7,7] | 1788 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 105.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
397 | densenet1_stage4_batchnorm1_fwd | BatchNorm | [1,192,7,7] | 35.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
398 | densenet1_stage4_relu1_fwd | Activation | [1,192,7,7] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
399 | densenet1_stage4_conv1_fwd | Convolution | [1,192,7,7] | 948 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 60.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
399 | densenet1_stage4_conv1_fwd | Convolution | [1,192,7,7] | 948 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 13.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
400 | densenet1_stage4_concat0 | Concat | [1,1056,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
400 | densenet1_stage4_concat0 | Concat | [1,1056,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
401 | densenet1_stage4_batchnorm2_fwd | BatchNorm | [1,1104,7,7] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
402 | densenet1_stage4_relu2_fwd | Activation | [1,1104,7,7] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
403 | densenet1_stage4_conv2_fwd | Convolution | [1,1104,7,7] | 1858 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
403 | densenet1_stage4_conv2_fwd | Convolution | [1,1104,7,7] | 1858 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 107.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
404 | densenet1_stage4_batchnorm3_fwd | BatchNorm | [1,192,7,7] | 41.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
405 | densenet1_stage4_relu3_fwd | Activation | [1,192,7,7] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
406 | densenet1_stage4_conv3_fwd | Convolution | [1,192,7,7] | 941 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
406 | densenet1_stage4_conv3_fwd | Convolution | [1,192,7,7] | 941 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
407 | densenet1_stage4_concat1 | Concat | [1,1104,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
407 | densenet1_stage4_concat1 | Concat | [1,1104,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
408 | densenet1_stage4_batchnorm4_fwd | BatchNorm | [1,1152,7,7] | 27.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
409 | densenet1_stage4_relu4_fwd | Activation | [1,1152,7,7] | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
410 | densenet1_stage4_conv4_fwd | Convolution | [1,1152,7,7] | 1937.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
410 | densenet1_stage4_conv4_fwd | Convolution | [1,1152,7,7] | 1937.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 113.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
411 | densenet1_stage4_batchnorm5_fwd | BatchNorm | [1,192,7,7] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
412 | densenet1_stage4_relu5_fwd | Activation | [1,192,7,7] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
413 | densenet1_stage4_conv5_fwd | Convolution | [1,192,7,7] | 939.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
413 | densenet1_stage4_conv5_fwd | Convolution | [1,192,7,7] | 939.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
414 | densenet1_stage4_concat2 | Concat | [1,1152,7,7] | 31.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
414 | densenet1_stage4_concat2 | Concat | [1,1152,7,7] | 31.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
415 | densenet1_stage4_batchnorm6_fwd | BatchNorm | [1,1200,7,7] | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
416 | densenet1_stage4_relu6_fwd | Activation | [1,1200,7,7] | 16.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
417 | densenet1_stage4_conv6_fwd | Convolution | [1,1200,7,7] | 2013 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 304.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
417 | densenet1_stage4_conv6_fwd | Convolution | [1,1200,7,7] | 2013 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
418 | densenet1_stage4_batchnorm7_fwd | BatchNorm | [1,192,7,7] | 37.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
419 | densenet1_stage4_relu7_fwd | Activation | [1,192,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
420 | densenet1_stage4_conv7_fwd | Convolution | [1,192,7,7] | 951.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
420 | densenet1_stage4_conv7_fwd | Convolution | [1,192,7,7] | 951.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
421 | densenet1_stage4_concat3 | Concat | [1,1200,7,7] | 30.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
421 | densenet1_stage4_concat3 | Concat | [1,1200,7,7] | 30.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
422 | densenet1_stage4_batchnorm8_fwd | BatchNorm | [1,1248,7,7] | 26.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
423 | densenet1_stage4_relu8_fwd | Activation | [1,1248,7,7] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
424 | densenet1_stage4_conv8_fwd | Convolution | [1,1248,7,7] | 2079 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 313.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
424 | densenet1_stage4_conv8_fwd | Convolution | [1,1248,7,7] | 2079 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
425 | densenet1_stage4_batchnorm9_fwd | BatchNorm | [1,192,7,7] | 54.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
426 | densenet1_stage4_relu9_fwd | Activation | [1,192,7,7] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
427 | densenet1_stage4_conv9_fwd | Convolution | [1,192,7,7] | 947.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
427 | densenet1_stage4_conv9_fwd | Convolution | [1,192,7,7] | 947.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
428 | densenet1_stage4_concat4 | Concat | [1,1248,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
428 | densenet1_stage4_concat4 | Concat | [1,1248,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
429 | densenet1_stage4_batchnorm10_fwd | BatchNorm | [1,1296,7,7] | 26.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
430 | densenet1_stage4_relu10_fwd | Activation | [1,1296,7,7] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
431 | densenet1_stage4_conv10_fwd | Convolution | [1,1296,7,7] | 2138.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 324.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
431 | densenet1_stage4_conv10_fwd | Convolution | [1,1296,7,7] | 2138.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
432 | densenet1_stage4_batchnorm11_fwd | BatchNorm | [1,192,7,7] | 55.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
433 | densenet1_stage4_relu11_fwd | Activation | [1,192,7,7] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
434 | densenet1_stage4_conv11_fwd | Convolution | [1,192,7,7] | 954.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
434 | densenet1_stage4_conv11_fwd | Convolution | [1,192,7,7] | 954.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
435 | densenet1_stage4_concat5 | Concat | [1,1296,7,7] | 31.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
435 | densenet1_stage4_concat5 | Concat | [1,1296,7,7] | 31.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
436 | densenet1_stage4_batchnorm12_fwd | BatchNorm | [1,1344,7,7] | 27.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
437 | densenet1_stage4_relu12_fwd | Activation | [1,1344,7,7] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
438 | densenet1_stage4_conv12_fwd | Convolution | [1,1344,7,7] | 2225 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 336.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
438 | densenet1_stage4_conv12_fwd | Convolution | [1,1344,7,7] | 2225 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
439 | densenet1_stage4_batchnorm13_fwd | BatchNorm | [1,192,7,7] | 56.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
440 | densenet1_stage4_relu13_fwd | Activation | [1,192,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
441 | densenet1_stage4_conv13_fwd | Convolution | [1,192,7,7] | 942.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
441 | densenet1_stage4_conv13_fwd | Convolution | [1,192,7,7] | 942.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
442 | densenet1_stage4_concat6 | Concat | [1,1344,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
442 | densenet1_stage4_concat6 | Concat | [1,1344,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
443 | densenet1_stage4_batchnorm14_fwd | BatchNorm | [1,1392,7,7] | 27.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
444 | densenet1_stage4_relu14_fwd | Activation | [1,1392,7,7] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
445 | densenet1_stage4_conv14_fwd | Convolution | [1,1392,7,7] | 2276.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 347.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
445 | densenet1_stage4_conv14_fwd | Convolution | [1,1392,7,7] | 2276.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
446 | densenet1_stage4_batchnorm15_fwd | BatchNorm | [1,192,7,7] | 55.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
447 | densenet1_stage4_relu15_fwd | Activation | [1,192,7,7] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
448 | densenet1_stage4_conv15_fwd | Convolution | [1,192,7,7] | 945.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
448 | densenet1_stage4_conv15_fwd | Convolution | [1,192,7,7] | 945.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
449 | densenet1_stage4_concat7 | Concat | [1,1392,7,7] | 30.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.71 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
449 | densenet1_stage4_concat7 | Concat | [1,1392,7,7] | 30.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
450 | densenet1_stage4_batchnorm16_fwd | BatchNorm | [1,1440,7,7] | 27.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
451 | densenet1_stage4_relu16_fwd | Activation | [1,1440,7,7] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
452 | densenet1_stage4_conv16_fwd | Convolution | [1,1440,7,7] | 2341.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 358.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
452 | densenet1_stage4_conv16_fwd | Convolution | [1,1440,7,7] | 2341.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
453 | densenet1_stage4_batchnorm17_fwd | BatchNorm | [1,192,7,7] | 55.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
454 | densenet1_stage4_relu17_fwd | Activation | [1,192,7,7] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
455 | densenet1_stage4_conv17_fwd | Convolution | [1,192,7,7] | 941.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
455 | densenet1_stage4_conv17_fwd | Convolution | [1,192,7,7] | 941.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
456 | densenet1_stage4_concat8 | Concat | [1,1440,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
456 | densenet1_stage4_concat8 | Concat | [1,1440,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
457 | densenet1_stage4_batchnorm18_fwd | BatchNorm | [1,1488,7,7] | 31.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
458 | densenet1_stage4_relu18_fwd | Activation | [1,1488,7,7] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
459 | densenet1_stage4_conv18_fwd | Convolution | [1,1488,7,7] | 2407.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 371.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
459 | densenet1_stage4_conv18_fwd | Convolution | [1,1488,7,7] | 2407.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
460 | densenet1_stage4_batchnorm19_fwd | BatchNorm | [1,192,7,7] | 56.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
461 | densenet1_stage4_relu19_fwd | Activation | [1,192,7,7] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
462 | densenet1_stage4_conv19_fwd | Convolution | [1,192,7,7] | 939.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
462 | densenet1_stage4_conv19_fwd | Convolution | [1,192,7,7] | 939.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
463 | densenet1_stage4_concat9 | Concat | [1,1488,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
463 | densenet1_stage4_concat9 | Concat | [1,1488,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
464 | densenet1_stage4_batchnorm20_fwd | BatchNorm | [1,1536,7,7] | 29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
465 | densenet1_stage4_relu20_fwd | Activation | [1,1536,7,7] | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
466 | densenet1_stage4_conv20_fwd | Convolution | [1,1536,7,7] | 2484.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 382.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
466 | densenet1_stage4_conv20_fwd | Convolution | [1,1536,7,7] | 2484.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
467 | densenet1_stage4_batchnorm21_fwd | BatchNorm | [1,192,7,7] | 55 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
468 | densenet1_stage4_relu21_fwd | Activation | [1,192,7,7] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
469 | densenet1_stage4_conv21_fwd | Convolution | [1,192,7,7] | 940.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
469 | densenet1_stage4_conv21_fwd | Convolution | [1,192,7,7] | 940.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
470 | densenet1_stage4_concat10 | Concat | [1,1536,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
470 | densenet1_stage4_concat10 | Concat | [1,1536,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
471 | densenet1_stage4_batchnorm22_fwd | BatchNorm | [1,1584,7,7] | 28.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
472 | densenet1_stage4_relu22_fwd | Activation | [1,1584,7,7] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
473 | densenet1_stage4_conv22_fwd | Convolution | [1,1584,7,7] | 2574.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 393.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
473 | densenet1_stage4_conv22_fwd | Convolution | [1,1584,7,7] | 2574.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
474 | densenet1_stage4_batchnorm23_fwd | BatchNorm | [1,192,7,7] | 67 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
475 | densenet1_stage4_relu23_fwd | Activation | [1,192,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
476 | densenet1_stage4_conv23_fwd | Convolution | [1,192,7,7] | 950.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
476 | densenet1_stage4_conv23_fwd | Convolution | [1,192,7,7] | 950.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
477 | densenet1_stage4_concat11 | Concat | [1,1584,7,7] | 28.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
477 | densenet1_stage4_concat11 | Concat | [1,1584,7,7] | 28.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
478 | densenet1_stage4_batchnorm24_fwd | BatchNorm | [1,1632,7,7] | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
479 | densenet1_stage4_relu24_fwd | Activation | [1,1632,7,7] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
480 | densenet1_stage4_conv24_fwd | Convolution | [1,1632,7,7] | 2626 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_128x64_relu_interior_nn | 405.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
480 | densenet1_stage4_conv24_fwd | Convolution | [1,1632,7,7] | 2626 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::maxwell::gemm::computeOffsetsKernel(cudnn::maxwell::gemm::ComputeOffsetsParams) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
481 | densenet1_stage4_batchnorm25_fwd | BatchNorm | [1,192,7,7] | 61.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
482 | densenet1_stage4_relu25_fwd | Activation | [1,192,7,7] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
483 | densenet1_stage4_conv25_fwd | Convolution | [1,192,7,7] | 950.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 56.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
483 | densenet1_stage4_conv25_fwd | Convolution | [1,192,7,7] | 950.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
484 | densenet1_stage4_concat12 | Concat | [1,1632,7,7] | 31 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
484 | densenet1_stage4_concat12 | Concat | [1,1632,7,7] | 31 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
485 | densenet1_stage4_batchnorm26_fwd | BatchNorm | [1,1680,7,7] | 29.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
486 | densenet1_stage4_relu26_fwd | Activation | [1,1680,7,7] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
487 | densenet1_stage4_conv26_fwd | Convolution | [1,1680,7,7] | 2687.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 163.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
487 | densenet1_stage4_conv26_fwd | Convolution | [1,1680,7,7] | 2687.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
488 | densenet1_stage4_batchnorm27_fwd | BatchNorm | [1,192,7,7] | 69.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
489 | densenet1_stage4_relu27_fwd | Activation | [1,192,7,7] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
490 | densenet1_stage4_conv27_fwd | Convolution | [1,192,7,7] | 943.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
490 | densenet1_stage4_conv27_fwd | Convolution | [1,192,7,7] | 943.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
491 | densenet1_stage4_concat13 | Concat | [1,1680,7,7] | 31.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
491 | densenet1_stage4_concat13 | Concat | [1,1680,7,7] | 31.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
492 | densenet1_stage4_batchnorm28_fwd | BatchNorm | [1,1728,7,7] | 29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
493 | densenet1_stage4_relu28_fwd | Activation | [1,1728,7,7] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
494 | densenet1_stage4_conv28_fwd | Convolution | [1,1728,7,7] | 2760.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 168.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
494 | densenet1_stage4_conv28_fwd | Convolution | [1,1728,7,7] | 2760.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
495 | densenet1_stage4_batchnorm29_fwd | BatchNorm | [1,192,7,7] | 69.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
496 | densenet1_stage4_relu29_fwd | Activation | [1,192,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
497 | densenet1_stage4_conv29_fwd | Convolution | [1,192,7,7] | 945.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
497 | densenet1_stage4_conv29_fwd | Convolution | [1,192,7,7] | 945.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
498 | densenet1_stage4_concat14 | Concat | [1,1728,7,7] | 31.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
498 | densenet1_stage4_concat14 | Concat | [1,1728,7,7] | 31.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
499 | densenet1_stage4_batchnorm30_fwd | BatchNorm | [1,1776,7,7] | 29.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
500 | densenet1_stage4_relu30_fwd | Activation | [1,1776,7,7] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
501 | densenet1_stage4_conv30_fwd | Convolution | [1,1776,7,7] | 2828 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 171.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
501 | densenet1_stage4_conv30_fwd | Convolution | [1,1776,7,7] | 2828 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
502 | densenet1_stage4_batchnorm31_fwd | BatchNorm | [1,192,7,7] | 74.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
503 | densenet1_stage4_relu31_fwd | Activation | [1,192,7,7] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
504 | densenet1_stage4_conv31_fwd | Convolution | [1,192,7,7] | 945.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
504 | densenet1_stage4_conv31_fwd | Convolution | [1,192,7,7] | 945.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
505 | densenet1_stage4_concat15 | Concat | [1,1776,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
505 | densenet1_stage4_concat15 | Concat | [1,1776,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
506 | densenet1_stage4_batchnorm32_fwd | BatchNorm | [1,1824,7,7] | 30.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
507 | densenet1_stage4_relu32_fwd | Activation | [1,1824,7,7] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
508 | densenet1_stage4_conv32_fwd | Convolution | [1,1824,7,7] | 2901 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 177.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
508 | densenet1_stage4_conv32_fwd | Convolution | [1,1824,7,7] | 2901 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
509 | densenet1_stage4_batchnorm33_fwd | BatchNorm | [1,192,7,7] | 70.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
510 | densenet1_stage4_relu33_fwd | Activation | [1,192,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
511 | densenet1_stage4_conv33_fwd | Convolution | [1,192,7,7] | 941 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
511 | densenet1_stage4_conv33_fwd | Convolution | [1,192,7,7] | 941 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
512 | densenet1_stage4_concat16 | Concat | [1,1824,7,7] | 32.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
512 | densenet1_stage4_concat16 | Concat | [1,1824,7,7] | 32.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
513 | densenet1_stage4_batchnorm34_fwd | BatchNorm | [1,1872,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
514 | densenet1_stage4_relu34_fwd | Activation | [1,1872,7,7] | 16.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
515 | densenet1_stage4_conv34_fwd | Convolution | [1,1872,7,7] | 2967.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 182.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
515 | densenet1_stage4_conv34_fwd | Convolution | [1,1872,7,7] | 2967.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
516 | densenet1_stage4_batchnorm35_fwd | BatchNorm | [1,192,7,7] | 70 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
517 | densenet1_stage4_relu35_fwd | Activation | [1,192,7,7] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
518 | densenet1_stage4_conv35_fwd | Convolution | [1,192,7,7] | 943 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
518 | densenet1_stage4_conv35_fwd | Convolution | [1,192,7,7] | 943 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
519 | densenet1_stage4_concat17 | Concat | [1,1872,7,7] | 29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
519 | densenet1_stage4_concat17 | Concat | [1,1872,7,7] | 29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
520 | densenet1_stage4_batchnorm36_fwd | BatchNorm | [1,1920,7,7] | 28.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
521 | densenet1_stage4_relu36_fwd | Activation | [1,1920,7,7] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
522 | densenet1_stage4_conv36_fwd | Convolution | [1,1920,7,7] | 3036.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 188.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
522 | densenet1_stage4_conv36_fwd | Convolution | [1,1920,7,7] | 3036.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
523 | densenet1_stage4_batchnorm37_fwd | BatchNorm | [1,192,7,7] | 76.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
524 | densenet1_stage4_relu37_fwd | Activation | [1,192,7,7] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
525 | densenet1_stage4_conv37_fwd | Convolution | [1,192,7,7] | 939 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
525 | densenet1_stage4_conv37_fwd | Convolution | [1,192,7,7] | 939 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
526 | densenet1_stage4_concat18 | Concat | [1,1920,7,7] | 33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
526 | densenet1_stage4_concat18 | Concat | [1,1920,7,7] | 33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
527 | densenet1_stage4_batchnorm38_fwd | BatchNorm | [1,1968,7,7] | 31 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
528 | densenet1_stage4_relu38_fwd | Activation | [1,1968,7,7] | 16.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
529 | densenet1_stage4_conv38_fwd | Convolution | [1,1968,7,7] | 3108.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 192.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
529 | densenet1_stage4_conv38_fwd | Convolution | [1,1968,7,7] | 3108.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
530 | densenet1_stage4_batchnorm39_fwd | BatchNorm | [1,192,7,7] | 75.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
531 | densenet1_stage4_relu39_fwd | Activation | [1,192,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
532 | densenet1_stage4_conv39_fwd | Convolution | [1,192,7,7] | 940.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
532 | densenet1_stage4_conv39_fwd | Convolution | [1,192,7,7] | 940.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
533 | densenet1_stage4_concat19 | Concat | [1,1968,7,7] | 28.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
533 | densenet1_stage4_concat19 | Concat | [1,1968,7,7] | 28.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
534 | densenet1_stage4_batchnorm40_fwd | BatchNorm | [1,2016,7,7] | 29.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
535 | densenet1_stage4_relu40_fwd | Activation | [1,2016,7,7] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
536 | densenet1_stage4_conv40_fwd | Convolution | [1,2016,7,7] | 3191.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 196.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
536 | densenet1_stage4_conv40_fwd | Convolution | [1,2016,7,7] | 3191.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
537 | densenet1_stage4_batchnorm41_fwd | BatchNorm | [1,192,7,7] | 86.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
538 | densenet1_stage4_relu41_fwd | Activation | [1,192,7,7] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
539 | densenet1_stage4_conv41_fwd | Convolution | [1,192,7,7] | 936.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
539 | densenet1_stage4_conv41_fwd | Convolution | [1,192,7,7] | 936.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
540 | densenet1_stage4_concat20 | Concat | [1,2016,7,7] | 33.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
540 | densenet1_stage4_concat20 | Concat | [1,2016,7,7] | 33.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
541 | densenet1_stage4_batchnorm42_fwd | BatchNorm | [1,2064,7,7] | 32 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
542 | densenet1_stage4_relu42_fwd | Activation | [1,2064,7,7] | 16.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
543 | densenet1_stage4_conv42_fwd | Convolution | [1,2064,7,7] | 3263 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 199.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
543 | densenet1_stage4_conv42_fwd | Convolution | [1,2064,7,7] | 3263 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
544 | densenet1_stage4_batchnorm43_fwd | BatchNorm | [1,192,7,7] | 75.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
545 | densenet1_stage4_relu43_fwd | Activation | [1,192,7,7] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
546 | densenet1_stage4_conv43_fwd | Convolution | [1,192,7,7] | 949.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
546 | densenet1_stage4_conv43_fwd | Convolution | [1,192,7,7] | 949.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
547 | densenet1_stage4_concat21 | Concat | [1,2064,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
547 | densenet1_stage4_concat21 | Concat | [1,2064,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
548 | densenet1_stage4_batchnorm44_fwd | BatchNorm | [1,2112,7,7] | 29.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
549 | densenet1_stage4_relu44_fwd | Activation | [1,2112,7,7] | 16.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
550 | densenet1_stage4_conv44_fwd | Convolution | [1,2112,7,7] | 3323.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 204.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
550 | densenet1_stage4_conv44_fwd | Convolution | [1,2112,7,7] | 3323.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
551 | densenet1_stage4_batchnorm45_fwd | BatchNorm | [1,192,7,7] | 83 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
552 | densenet1_stage4_relu45_fwd | Activation | [1,192,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
553 | densenet1_stage4_conv45_fwd | Convolution | [1,192,7,7] | 943 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
553 | densenet1_stage4_conv45_fwd | Convolution | [1,192,7,7] | 943 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
554 | densenet1_stage4_concat22 | Concat | [1,2112,7,7] | 32.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
554 | densenet1_stage4_concat22 | Concat | [1,2112,7,7] | 32.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
555 | densenet1_stage4_batchnorm46_fwd | BatchNorm | [1,2160,7,7] | 30.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
556 | densenet1_stage4_relu46_fwd | Activation | [1,2160,7,7] | 16.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
557 | densenet1_stage4_conv46_fwd | Convolution | [1,2160,7,7] | 3410.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 212.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
557 | densenet1_stage4_conv46_fwd | Convolution | [1,2160,7,7] | 3410.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 132.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
558 | densenet1_stage4_batchnorm47_fwd | BatchNorm | [1,192,7,7] | 79.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
559 | densenet1_stage4_relu47_fwd | Activation | [1,192,7,7] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
560 | densenet1_stage4_conv47_fwd | Convolution | [1,192,7,7] | 941 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | maxwell_scudnn_winograd_128x128_ldg1_ldg4_tile148n_nt | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
560 | densenet1_stage4_conv47_fwd | Convolution | [1,192,7,7] | 941 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
561 | densenet1_stage4_concat23 | Concat | [1,2160,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
561 | densenet1_stage4_concat23 | Concat | [1,2160,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
562 | densenet1_batchnorm4_fwd | BatchNorm | [1,2208,7,7] | 30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
563 | densenet1_relu4_fwd | Activation | [1,2208,7,7] | 17.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
564 | densenet1_pool4_fwd | Pooling | [1,2208,7,7] | 208.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 17.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
566 | densenet1_dense0_fwd | FullyConnected | [1,2208] | 2856 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 2, 2, false, cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float> >(cublasGemvParams<cublasGemvTensor<float const>, cublasGemvTensor<float>, float>, float, float) | 79.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
566 | densenet1_dense0_fwd | FullyConnected | [1,2208] | 2856 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
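Note on reading this trace: in this capture the per-kernel FLOP and DRAM counters were not collected (kernel_flops, kernel_dram_read_bytes and kernel_dram_write_bytes are all zero), so the derived arithmetic-intensity, throughput and memory-bound columns are placeholders; only the layer and kernel durations carry information. A minimal parsing sketch follows, under the assumption that the table is saved as a plain pipe-delimited text file (the file name `densenet_kernels.txt` and the helper names are hypothetical, not part of the profiler output). It shows how the kernel_arithmetic_intensity column would be derived (FLOPs divided by DRAM bytes moved) and how to aggregate kernel time by layer type.

```python
import pandas as pd

# Column names taken from the table header above.
COLUMNS = [
    "layer_index", "layer_name", "layer_type", "layer_shape",
    "layer_duration_us", "layer_allocated_bytes", "layer_peak_allocated_bytes",
    "layer_allocator_bytes_in_use", "layer_allocator_name",
    "layer_host_temp_mem_bytes", "layer_device_temp_mem_bytes",
    "layer_host_persistent_mem_bytes", "layer_device_persistent_mem_bytes",
    "kernel_name", "kernel_duration_us", "kernel_flops",
    "kernel_dram_read_bytes", "kernel_dram_write_bytes",
    "kernel_achieved_occupancy", "kernel_arithmetic_intensity",
    "kernel_arithmetic_throughput_gflops", "kernel_memory_bound",
]

def load_trace(path):
    """Parse the pipe-delimited kernel dump into a DataFrame.

    Header, separator and footer lines are skipped; only rows whose first
    field is a numeric layer index are kept.
    """
    rows = []
    with open(path) as f:
        for line in f:
            parts = [p.strip() for p in line.strip().strip("|").split("|")]
            if len(parts) < len(COLUMNS) or not parts[0].isdigit():
                continue
            rows.append(parts[: len(COLUMNS)])
    df = pd.DataFrame(rows, columns=COLUMNS)
    num_cols = ["layer_duration_us", "kernel_duration_us", "kernel_flops",
                "kernel_dram_read_bytes", "kernel_dram_write_bytes"]
    df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")
    return df

def arithmetic_intensity(flops, dram_read_bytes, dram_write_bytes):
    """AI (flops/byte) = kernel FLOPs / total DRAM bytes read and written."""
    total_bytes = dram_read_bytes + dram_write_bytes
    return flops / total_bytes if total_bytes > 0 else 0.0

if __name__ == "__main__":
    df = load_trace("densenet_kernels.txt")  # hypothetical file name
    # Total kernel time per layer type (Convolution, BatchNorm, Concat, ...).
    print(df.groupby("layer_type")["kernel_duration_us"]
            .sum().sort_values(ascending=False))
```

With the counters populated, a kernel would typically be flagged memory-bound when its arithmetic intensity falls below the device's FLOPs-to-bandwidth ratio (the roofline "machine balance"); that threshold is device-specific and is an assumption here, not something recorded in this trace.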