GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | resnetv24_batchnorm0_fwd | BatchNorm | [1,3,224,224] | 185.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 23.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
0 | resnetv24_batchnorm0_fwd | BatchNorm | [1,3,224,224] | 185.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
1 | resnetv24_conv0_fwd | Convolution | [1,3,224,224] | 20242.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_medium_nn_v1 | 42.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
1 | resnetv24_conv0_fwd | Convolution | [1,3,224,224] | 20242.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
2 | resnetv24_batchnorm1_fwd | BatchNorm | [1,64,112,112] | 916.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
3 | resnetv24_relu0_fwd | Activation | [1,64,112,112] | 207.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
4 | resnetv24_pool0_fwd | Pooling | [1,64,112,112] | 2844 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
5 | resnetv24_stage1_batchnorm0_fwd | BatchNorm | [1,64,56,56] | 238 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
6 | resnetv24_stage1_activation0 | Activation | [1,64,56,56] | 59.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
7 | resnetv24_stage1_conv0_fwd | Convolution | [1,64,56,56] | 2070.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
8 | resnetv24_stage1_batchnorm1_fwd | BatchNorm | [1,64,56,56] | 43 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
9 | resnetv24_stage1_activation1 | Activation | [1,64,56,56] | 59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
10 | resnetv24_stage1_conv1_fwd | Convolution | [1,64,56,56] | 14897.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 25.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
10 | resnetv24_stage1_conv1_fwd | Convolution | [1,64,56,56] | 14897.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
11 | resnetv24_stage1_batchnorm2_fwd | BatchNorm | [1,64,56,56] | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
12 | resnetv24_stage1_activation2 | Activation | [1,64,56,56] | 41.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
13 | resnetv24_stage1_conv2_fwd | Convolution | [1,64,56,56] | 5991.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
13 | resnetv24_stage1_conv2_fwd | Convolution | [1,64,56,56] | 5991.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
14 | resnetv24_stage1_conv3_fwd | Convolution | [1,64,56,56] | 5079.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 21.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
14 | resnetv24_stage1_conv3_fwd | Convolution | [1,64,56,56] | 5079.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
15 | resnetv24_stage1__plus0 | elemwise_add | [1,256,56,56] | 210 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
16 | resnetv24_stage1_batchnorm3_fwd | BatchNorm | [1,256,56,56] | 95 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
17 | resnetv24_stage1_activation3 | Activation | [1,256,56,56] | 166.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
18 | resnetv24_stage1_conv4_fwd | Convolution | [1,256,56,56] | 6211 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 28.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
19 | resnetv24_stage1_batchnorm4_fwd | BatchNorm | [1,64,56,56] | 105 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
20 | resnetv24_stage1_activation4 | Activation | [1,64,56,56] | 56.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
21 | resnetv24_stage1_conv5_fwd | Convolution | [1,64,56,56] | 14360.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 25.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
21 | resnetv24_stage1_conv5_fwd | Convolution | [1,64,56,56] | 14360.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
22 | resnetv24_stage1_batchnorm5_fwd | BatchNorm | [1,64,56,56] | 104 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
23 | resnetv24_stage1_activation5 | Activation | [1,64,56,56] | 53.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
24 | resnetv24_stage1_conv6_fwd | Convolution | [1,64,56,56] | 5948.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
24 | resnetv24_stage1_conv6_fwd | Convolution | [1,64,56,56] | 5948.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
25 | resnetv24_stage1__plus1 | elemwise_add | [1,256,56,56] | 243 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
26 | resnetv24_stage1_batchnorm6_fwd | BatchNorm | [1,256,56,56] | 119.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
27 | resnetv24_stage1_activation6 | Activation | [1,256,56,56] | 214.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
28 | resnetv24_stage1_conv7_fwd | Convolution | [1,256,56,56] | 6280.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 28.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
29 | resnetv24_stage1_batchnorm7_fwd | BatchNorm | [1,64,56,56] | 106.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
30 | resnetv24_stage1_activation7 | Activation | [1,64,56,56] | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
31 | resnetv24_stage1_conv8_fwd | Convolution | [1,64,56,56] | 14242 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 26.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
31 | resnetv24_stage1_conv8_fwd | Convolution | [1,64,56,56] | 14242 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
32 | resnetv24_stage1_batchnorm8_fwd | BatchNorm | [1,64,56,56] | 93.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
33 | resnetv24_stage1_activation8 | Activation | [1,64,56,56] | 40.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
34 | resnetv24_stage1_conv9_fwd | Convolution | [1,64,56,56] | 5717 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
34 | resnetv24_stage1_conv9_fwd | Convolution | [1,64,56,56] | 5717 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
35 | resnetv24_stage1__plus2 | elemwise_add | [1,256,56,56] | 214.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
36 | resnetv24_stage2_batchnorm0_fwd | BatchNorm | [1,256,56,56] | 107.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 10.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
37 | resnetv24_stage2_activation0 | Activation | [1,256,56,56] | 189.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
38 | resnetv24_stage2_conv0_fwd | Convolution | [1,256,56,56] | 10987 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 35.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
38 | resnetv24_stage2_conv0_fwd | Convolution | [1,256,56,56] | 10987 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
39 | resnetv24_stage2_batchnorm1_fwd | BatchNorm | [1,128,56,56] | 117 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
40 | resnetv24_stage2_activation1 | Activation | [1,128,56,56] | 87.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
41 | resnetv24_stage2_conv1_fwd | Convolution | [1,128,56,56] | 12921.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 97.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
42 | resnetv24_stage2_batchnorm2_fwd | BatchNorm | [1,128,28,28] | 97.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
43 | resnetv24_stage2_activation2 | Activation | [1,128,28,28] | 28.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
44 | resnetv24_stage2_conv2_fwd | Convolution | [1,128,28,28] | 5879.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
44 | resnetv24_stage2_conv2_fwd | Convolution | [1,128,28,28] | 5879.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
45 | resnetv24_stage2_conv3_fwd | Convolution | [1,256,56,56] | 9777.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 38.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
45 | resnetv24_stage2_conv3_fwd | Convolution | [1,256,56,56] | 9777.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
46 | resnetv24_stage2__plus0 | elemwise_add | [1,512,28,28] | 154.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
47 | resnetv24_stage2_batchnorm3_fwd | BatchNorm | [1,512,28,28] | 59.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
48 | resnetv24_stage2_activation3 | Activation | [1,512,28,28] | 102.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
49 | resnetv24_stage2_conv4_fwd | Convolution | [1,512,28,28] | 6120.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 44.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
50 | resnetv24_stage2_batchnorm4_fwd | BatchNorm | [1,128,28,28] | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
51 | resnetv24_stage2_activation4 | Activation | [1,128,28,28] | 33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
52 | resnetv24_stage2_conv5_fwd | Convolution | [1,128,28,28] | 13190.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
52 | resnetv24_stage2_conv5_fwd | Convolution | [1,128,28,28] | 13190.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
53 | resnetv24_stage2_batchnorm5_fwd | BatchNorm | [1,128,28,28] | 94.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
54 | resnetv24_stage2_activation5 | Activation | [1,128,28,28] | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
55 | resnetv24_stage2_conv6_fwd | Convolution | [1,128,28,28] | 5962.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
55 | resnetv24_stage2_conv6_fwd | Convolution | [1,128,28,28] | 5962.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
56 | resnetv24_stage2__plus1 | elemwise_add | [1,512,28,28] | 147.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 5.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
57 | resnetv24_stage2_batchnorm6_fwd | BatchNorm | [1,512,28,28] | 55 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
58 | resnetv24_stage2_activation6 | Activation | [1,512,28,28] | 83 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
59 | resnetv24_stage2_conv7_fwd | Convolution | [1,512,28,28] | 6149 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 45.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
60 | resnetv24_stage2_batchnorm7_fwd | BatchNorm | [1,128,28,28] | 98.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
61 | resnetv24_stage2_activation7 | Activation | [1,128,28,28] | 32.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
62 | resnetv24_stage2_conv8_fwd | Convolution | [1,128,28,28] | 12921 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
62 | resnetv24_stage2_conv8_fwd | Convolution | [1,128,28,28] | 12921 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
63 | resnetv24_stage2_batchnorm8_fwd | BatchNorm | [1,128,28,28] | 93.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
64 | resnetv24_stage2_activation8 | Activation | [1,128,28,28] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
65 | resnetv24_stage2_conv9_fwd | Convolution | [1,128,28,28] | 5671.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
65 | resnetv24_stage2_conv9_fwd | Convolution | [1,128,28,28] | 5671.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
66 | resnetv24_stage2__plus2 | elemwise_add | [1,512,28,28] | 147.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
67 | resnetv24_stage2_batchnorm9_fwd | BatchNorm | [1,512,28,28] | 53 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
68 | resnetv24_stage2_activation9 | Activation | [1,512,28,28] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
69 | resnetv24_stage2_conv10_fwd | Convolution | [1,512,28,28] | 6000.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 45.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
70 | resnetv24_stage2_batchnorm10_fwd | BatchNorm | [1,128,28,28] | 92.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
71 | resnetv24_stage2_activation10 | Activation | [1,128,28,28] | 23.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
72 | resnetv24_stage2_conv11_fwd | Convolution | [1,128,28,28] | 13021.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
72 | resnetv24_stage2_conv11_fwd | Convolution | [1,128,28,28] | 13021.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
73 | resnetv24_stage2_batchnorm11_fwd | BatchNorm | [1,128,28,28] | 93.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
74 | resnetv24_stage2_activation11 | Activation | [1,128,28,28] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
75 | resnetv24_stage2_conv12_fwd | Convolution | [1,128,28,28] | 5788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
75 | resnetv24_stage2_conv12_fwd | Convolution | [1,128,28,28] | 5788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
76 | resnetv24_stage2__plus3 | elemwise_add | [1,512,28,28] | 154.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 5.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
77 | resnetv24_stage2_batchnorm12_fwd | BatchNorm | [1,512,28,28] | 54.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
78 | resnetv24_stage2_activation12 | Activation | [1,512,28,28] | 81.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
79 | resnetv24_stage2_conv13_fwd | Convolution | [1,512,28,28] | 6028 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 45.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
80 | resnetv24_stage2_batchnorm13_fwd | BatchNorm | [1,128,28,28] | 93 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
81 | resnetv24_stage2_activation13 | Activation | [1,128,28,28] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
82 | resnetv24_stage2_conv14_fwd | Convolution | [1,128,28,28] | 12862.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
82 | resnetv24_stage2_conv14_fwd | Convolution | [1,128,28,28] | 12862.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
83 | resnetv24_stage2_batchnorm14_fwd | BatchNorm | [1,128,28,28] | 83.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
84 | resnetv24_stage2_activation14 | Activation | [1,128,28,28] | 21.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
85 | resnetv24_stage2_conv15_fwd | Convolution | [1,128,28,28] | 5754.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
85 | resnetv24_stage2_conv15_fwd | Convolution | [1,128,28,28] | 5754.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
86 | resnetv24_stage2__plus4 | elemwise_add | [1,512,28,28] | 141.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 5.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
87 | resnetv24_stage2_batchnorm15_fwd | BatchNorm | [1,512,28,28] | 53.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
88 | resnetv24_stage2_activation15 | Activation | [1,512,28,28] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
89 | resnetv24_stage2_conv16_fwd | Convolution | [1,512,28,28] | 6130.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 45.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
90 | resnetv24_stage2_batchnorm16_fwd | BatchNorm | [1,128,28,28] | 99.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
91 | resnetv24_stage2_activation16 | Activation | [1,128,28,28] | 23.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
92 | resnetv24_stage2_conv17_fwd | Convolution | [1,128,28,28] | 13316.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
92 | resnetv24_stage2_conv17_fwd | Convolution | [1,128,28,28] | 13316.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
93 | resnetv24_stage2_batchnorm17_fwd | BatchNorm | [1,128,28,28] | 85.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
94 | resnetv24_stage2_activation17 | Activation | [1,128,28,28] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
95 | resnetv24_stage2_conv18_fwd | Convolution | [1,128,28,28] | 5829 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
95 | resnetv24_stage2_conv18_fwd | Convolution | [1,128,28,28] | 5829 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
96 | resnetv24_stage2__plus5 | elemwise_add | [1,512,28,28] | 146.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 5.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
97 | resnetv24_stage2_batchnorm18_fwd | BatchNorm | [1,512,28,28] | 55 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
98 | resnetv24_stage2_activation18 | Activation | [1,512,28,28] | 86.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
99 | resnetv24_stage2_conv19_fwd | Convolution | [1,512,28,28] | 6109.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 45.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
100 | resnetv24_stage2_batchnorm19_fwd | BatchNorm | [1,128,28,28] | 96.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
101 | resnetv24_stage2_activation19 | Activation | [1,128,28,28] | 20.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
102 | resnetv24_stage2_conv20_fwd | Convolution | [1,128,28,28] | 12859 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
102 | resnetv24_stage2_conv20_fwd | Convolution | [1,128,28,28] | 12859 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
103 | resnetv24_stage2_batchnorm20_fwd | BatchNorm | [1,128,28,28] | 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
104 | resnetv24_stage2_activation20 | Activation | [1,128,28,28] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
105 | resnetv24_stage2_conv21_fwd | Convolution | [1,128,28,28] | 5743.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
105 | resnetv24_stage2_conv21_fwd | Convolution | [1,128,28,28] | 5743.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
106 | resnetv24_stage2__plus6 | elemwise_add | [1,512,28,28] | 145 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 5.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
107 | resnetv24_stage2_batchnorm21_fwd | BatchNorm | [1,512,28,28] | 51 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
108 | resnetv24_stage2_activation21 | Activation | [1,512,28,28] | 69.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
109 | resnetv24_stage2_conv22_fwd | Convolution | [1,512,28,28] | 6001.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 45.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
110 | resnetv24_stage2_batchnorm22_fwd | BatchNorm | [1,128,28,28] | 83.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
111 | resnetv24_stage2_activation22 | Activation | [1,128,28,28] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
112 | resnetv24_stage2_conv23_fwd | Convolution | [1,128,28,28] | 12766.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
112 | resnetv24_stage2_conv23_fwd | Convolution | [1,128,28,28] | 12766.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
113 | resnetv24_stage2_batchnorm23_fwd | BatchNorm | [1,128,28,28] | 88.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
114 | resnetv24_stage2_activation23 | Activation | [1,128,28,28] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
115 | resnetv24_stage2_conv24_fwd | Convolution | [1,128,28,28] | 5633.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 22.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
115 | resnetv24_stage2_conv24_fwd | Convolution | [1,128,28,28] | 5633.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
116 | resnetv24_stage2__plus7 | elemwise_add | [1,512,28,28] | 144.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 5.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
117 | resnetv24_stage3_batchnorm0_fwd | BatchNorm | [1,512,28,28] | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
118 | resnetv24_stage3_activation0 | Activation | [1,512,28,28] | 64.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
119 | resnetv24_stage3_conv0_fwd | Convolution | [1,512,28,28] | 10447.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 50.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
120 | resnetv24_stage3_batchnorm1_fwd | BatchNorm | [1,256,28,28] | 103.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
121 | resnetv24_stage3_activation1 | Activation | [1,256,28,28] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
122 | resnetv24_stage3_conv1_fwd | Convolution | [1,256,28,28] | 12475.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 130.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
123 | resnetv24_stage3_batchnorm2_fwd | BatchNorm | [1,256,14,14] | 84.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
124 | resnetv24_stage3_activation2 | Activation | [1,256,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
125 | resnetv24_stage3_conv2_fwd | Convolution | [1,256,14,14] | 5709 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
126 | resnetv24_stage3_conv3_fwd | Convolution | [1,512,28,28] | 9869.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 65.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
126 | resnetv24_stage3_conv3_fwd | Convolution | [1,512,28,28] | 9869.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
127 | resnetv24_stage3__plus0 | elemwise_add | [1,1024,14,14] | 121.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
128 | resnetv24_stage3_batchnorm3_fwd | BatchNorm | [1,1024,14,14] | 39 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
129 | resnetv24_stage3_activation3 | Activation | [1,1024,14,14] | 42.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
130 | resnetv24_stage3_conv4_fwd | Convolution | [1,1024,14,14] | 5981.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
131 | resnetv24_stage3_batchnorm4_fwd | BatchNorm | [1,256,14,14] | 99 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
132 | resnetv24_stage3_activation4 | Activation | [1,256,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
133 | resnetv24_stage3_conv5_fwd | Convolution | [1,256,14,14] | 12638.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
133 | resnetv24_stage3_conv5_fwd | Convolution | [1,256,14,14] | 12638.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
134 | resnetv24_stage3_batchnorm5_fwd | BatchNorm | [1,256,14,14] | 107.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
135 | resnetv24_stage3_activation5 | Activation | [1,256,14,14] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
136 | resnetv24_stage3_conv6_fwd | Convolution | [1,256,14,14] | 5756.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 31.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
137 | resnetv24_stage3__plus1 | elemwise_add | [1,1024,14,14] | 101 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
138 | resnetv24_stage3_batchnorm6_fwd | BatchNorm | [1,1024,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
139 | resnetv24_stage3_activation6 | Activation | [1,1024,14,14] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
140 | resnetv24_stage3_conv7_fwd | Convolution | [1,1024,14,14] | 5801 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
141 | resnetv24_stage3_batchnorm7_fwd | BatchNorm | [1,256,14,14] | 84.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
142 | resnetv24_stage3_activation7 | Activation | [1,256,14,14] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
143 | resnetv24_stage3_conv8_fwd | Convolution | [1,256,14,14] | 12355.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
143 | resnetv24_stage3_conv8_fwd | Convolution | [1,256,14,14] | 12355.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
144 | resnetv24_stage3_batchnorm8_fwd | BatchNorm | [1,256,14,14] | 74.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
145 | resnetv24_stage3_activation8 | Activation | [1,256,14,14] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
146 | resnetv24_stage3_conv9_fwd | Convolution | [1,256,14,14] | 5463.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
147 | resnetv24_stage3__plus2 | elemwise_add | [1,1024,14,14] | 95.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
148 | resnetv24_stage3_batchnorm9_fwd | BatchNorm | [1,1024,14,14] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
149 | resnetv24_stage3_activation9 | Activation | [1,1024,14,14] | 41 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
150 | resnetv24_stage3_conv10_fwd | Convolution | [1,1024,14,14] | 5776.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
151 | resnetv24_stage3_batchnorm10_fwd | BatchNorm | [1,256,14,14] | 87 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
152 | resnetv24_stage3_activation10 | Activation | [1,256,14,14] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
153 | resnetv24_stage3_conv11_fwd | Convolution | [1,256,14,14] | 12386 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
153 | resnetv24_stage3_conv11_fwd | Convolution | [1,256,14,14] | 12386 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
154 | resnetv24_stage3_batchnorm11_fwd | BatchNorm | [1,256,14,14] | 83.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
155 | resnetv24_stage3_activation11 | Activation | [1,256,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
156 | resnetv24_stage3_conv12_fwd | Convolution | [1,256,14,14] | 5540 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
157 | resnetv24_stage3__plus3 | elemwise_add | [1,1024,14,14] | 105.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
158 | resnetv24_stage3_batchnorm12_fwd | BatchNorm | [1,1024,14,14] | 40.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
159 | resnetv24_stage3_activation12 | Activation | [1,1024,14,14] | 41 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
160 | resnetv24_stage3_conv13_fwd | Convolution | [1,1024,14,14] | 5818.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
161 | resnetv24_stage3_batchnorm13_fwd | BatchNorm | [1,256,14,14] | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
162 | resnetv24_stage3_activation13 | Activation | [1,256,14,14] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
163 | resnetv24_stage3_conv14_fwd | Convolution | [1,256,14,14] | 12452.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
163 | resnetv24_stage3_conv14_fwd | Convolution | [1,256,14,14] | 12452.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
164 | resnetv24_stage3_batchnorm14_fwd | BatchNorm | [1,256,14,14] | 99.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
165 | resnetv24_stage3_activation14 | Activation | [1,256,14,14] | 12.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
166 | resnetv24_stage3_conv15_fwd | Convolution | [1,256,14,14] | 5519.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
167 | resnetv24_stage3__plus4 | elemwise_add | [1,1024,14,14] | 114 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
168 | resnetv24_stage3_batchnorm15_fwd | BatchNorm | [1,1024,14,14] | 35.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
169 | resnetv24_stage3_activation15 | Activation | [1,1024,14,14] | 38.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
170 | resnetv24_stage3_conv16_fwd | Convolution | [1,1024,14,14] | 5971.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
171 | resnetv24_stage3_batchnorm16_fwd | BatchNorm | [1,256,14,14] | 102.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
172 | resnetv24_stage3_activation16 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
173 | resnetv24_stage3_conv17_fwd | Convolution | [1,256,14,14] | 12667.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
173 | resnetv24_stage3_conv17_fwd | Convolution | [1,256,14,14] | 12667.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
174 | resnetv24_stage3_batchnorm17_fwd | BatchNorm | [1,256,14,14] | 90.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
175 | resnetv24_stage3_activation17 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
176 | resnetv24_stage3_conv18_fwd | Convolution | [1,256,14,14] | 5517 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
177 | resnetv24_stage3__plus5 | elemwise_add | [1,1024,14,14] | 113.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
178 | resnetv24_stage3_batchnorm18_fwd | BatchNorm | [1,1024,14,14] | 38 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
179 | resnetv24_stage3_activation18 | Activation | [1,1024,14,14] | 41 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
180 | resnetv24_stage3_conv19_fwd | Convolution | [1,1024,14,14] | 5815 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
181 | resnetv24_stage3_batchnorm19_fwd | BatchNorm | [1,256,14,14] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
182 | resnetv24_stage3_activation19 | Activation | [1,256,14,14] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
183 | resnetv24_stage3_conv20_fwd | Convolution | [1,256,14,14] | 12408.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
183 | resnetv24_stage3_conv20_fwd | Convolution | [1,256,14,14] | 12408.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
184 | resnetv24_stage3_batchnorm20_fwd | BatchNorm | [1,256,14,14] | 100.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
185 | resnetv24_stage3_activation20 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
186 | resnetv24_stage3_conv21_fwd | Convolution | [1,256,14,14] | 5585.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
187 | resnetv24_stage3__plus6 | elemwise_add | [1,1024,14,14] | 113.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
188 | resnetv24_stage3_batchnorm21_fwd | BatchNorm | [1,1024,14,14] | 41 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
189 | resnetv24_stage3_activation21 | Activation | [1,1024,14,14] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
190 | resnetv24_stage3_conv22_fwd | Convolution | [1,1024,14,14] | 5836 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
191 | resnetv24_stage3_batchnorm22_fwd | BatchNorm | [1,256,14,14] | 86.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
192 | resnetv24_stage3_activation22 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
193 | resnetv24_stage3_conv23_fwd | Convolution | [1,256,14,14] | 12367 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
193 | resnetv24_stage3_conv23_fwd | Convolution | [1,256,14,14] | 12367 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
194 | resnetv24_stage3_batchnorm23_fwd | BatchNorm | [1,256,14,14] | 85.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
195 | resnetv24_stage3_activation23 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
196 | resnetv24_stage3_conv24_fwd | Convolution | [1,256,14,14] | 5529.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
197 | resnetv24_stage3__plus7 | elemwise_add | [1,1024,14,14] | 103.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
198 | resnetv24_stage3_batchnorm24_fwd | BatchNorm | [1,1024,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
199 | resnetv24_stage3_activation24 | Activation | [1,1024,14,14] | 34.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
200 | resnetv24_stage3_conv25_fwd | Convolution | [1,1024,14,14] | 5824.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
201 | resnetv24_stage3_batchnorm25_fwd | BatchNorm | [1,256,14,14] | 81.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
202 | resnetv24_stage3_activation25 | Activation | [1,256,14,14] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
203 | resnetv24_stage3_conv26_fwd | Convolution | [1,256,14,14] | 12431.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
203 | resnetv24_stage3_conv26_fwd | Convolution | [1,256,14,14] | 12431.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
204 | resnetv24_stage3_batchnorm26_fwd | BatchNorm | [1,256,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
205 | resnetv24_stage3_activation26 | Activation | [1,256,14,14] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
206 | resnetv24_stage3_conv27_fwd | Convolution | [1,256,14,14] | 5576.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
207 | resnetv24_stage3__plus8 | elemwise_add | [1,1024,14,14] | 117 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
208 | resnetv24_stage3_batchnorm27_fwd | BatchNorm | [1,1024,14,14] | 37.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
209 | resnetv24_stage3_activation27 | Activation | [1,1024,14,14] | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
210 | resnetv24_stage3_conv28_fwd | Convolution | [1,1024,14,14] | 5841.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
211 | resnetv24_stage3_batchnorm28_fwd | BatchNorm | [1,256,14,14] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
212 | resnetv24_stage3_activation28 | Activation | [1,256,14,14] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
213 | resnetv24_stage3_conv29_fwd | Convolution | [1,256,14,14] | 12399 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
213 | resnetv24_stage3_conv29_fwd | Convolution | [1,256,14,14] | 12399 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
214 | resnetv24_stage3_batchnorm29_fwd | BatchNorm | [1,256,14,14] | 85.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
215 | resnetv24_stage3_activation29 | Activation | [1,256,14,14] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
216 | resnetv24_stage3_conv30_fwd | Convolution | [1,256,14,14] | 5542.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
217 | resnetv24_stage3__plus9 | elemwise_add | [1,1024,14,14] | 98.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
218 | resnetv24_stage3_batchnorm30_fwd | BatchNorm | [1,1024,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
219 | resnetv24_stage3_activation30 | Activation | [1,1024,14,14] | 34.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
220 | resnetv24_stage3_conv31_fwd | Convolution | [1,1024,14,14] | 5827.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
221 | resnetv24_stage3_batchnorm31_fwd | BatchNorm | [1,256,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
222 | resnetv24_stage3_activation31 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
223 | resnetv24_stage3_conv32_fwd | Convolution | [1,256,14,14] | 12305 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
223 | resnetv24_stage3_conv32_fwd | Convolution | [1,256,14,14] | 12305 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
224 | resnetv24_stage3_batchnorm32_fwd | BatchNorm | [1,256,14,14] | 75.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
225 | resnetv24_stage3_activation32 | Activation | [1,256,14,14] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
226 | resnetv24_stage3_conv33_fwd | Convolution | [1,256,14,14] | 5496.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
227 | resnetv24_stage3__plus10 | elemwise_add | [1,1024,14,14] | 92 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
228 | resnetv24_stage3_batchnorm33_fwd | BatchNorm | [1,1024,14,14] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
229 | resnetv24_stage3_activation33 | Activation | [1,1024,14,14] | 33.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
230 | resnetv24_stage3_conv34_fwd | Convolution | [1,1024,14,14] | 5792.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
231 | resnetv24_stage3_batchnorm34_fwd | BatchNorm | [1,256,14,14] | 80.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
232 | resnetv24_stage3_activation34 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
233 | resnetv24_stage3_conv35_fwd | Convolution | [1,256,14,14] | 12323.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
233 | resnetv24_stage3_conv35_fwd | Convolution | [1,256,14,14] | 12323.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
234 | resnetv24_stage3_batchnorm35_fwd | BatchNorm | [1,256,14,14] | 83 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
235 | resnetv24_stage3_activation35 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
236 | resnetv24_stage3_conv36_fwd | Convolution | [1,256,14,14] | 5504 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 31.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
237 | resnetv24_stage3__plus11 | elemwise_add | [1,1024,14,14] | 103 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
238 | resnetv24_stage3_batchnorm36_fwd | BatchNorm | [1,1024,14,14] | 36.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
239 | resnetv24_stage3_activation36 | Activation | [1,1024,14,14] | 35.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
240 | resnetv24_stage3_conv37_fwd | Convolution | [1,1024,14,14] | 5801 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
241 | resnetv24_stage3_batchnorm37_fwd | BatchNorm | [1,256,14,14] | 91.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
242 | resnetv24_stage3_activation37 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
243 | resnetv24_stage3_conv38_fwd | Convolution | [1,256,14,14] | 12312.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
243 | resnetv24_stage3_conv38_fwd | Convolution | [1,256,14,14] | 12312.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
244 | resnetv24_stage3_batchnorm38_fwd | BatchNorm | [1,256,14,14] | 79 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
245 | resnetv24_stage3_activation38 | Activation | [1,256,14,14] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
246 | resnetv24_stage3_conv39_fwd | Convolution | [1,256,14,14] | 5541 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
247 | resnetv24_stage3__plus12 | elemwise_add | [1,1024,14,14] | 104.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
248 | resnetv24_stage3_batchnorm39_fwd | BatchNorm | [1,1024,14,14] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
249 | resnetv24_stage3_activation39 | Activation | [1,1024,14,14] | 36.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
250 | resnetv24_stage3_conv40_fwd | Convolution | [1,1024,14,14] | 5938 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
251 | resnetv24_stage3_batchnorm40_fwd | BatchNorm | [1,256,14,14] | 85.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
252 | resnetv24_stage3_activation40 | Activation | [1,256,14,14] | 16.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
253 | resnetv24_stage3_conv41_fwd | Convolution | [1,256,14,14] | 12349.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
253 | resnetv24_stage3_conv41_fwd | Convolution | [1,256,14,14] | 12349.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
254 | resnetv24_stage3_batchnorm41_fwd | BatchNorm | [1,256,14,14] | 81 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
255 | resnetv24_stage3_activation41 | Activation | [1,256,14,14] | 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
256 | resnetv24_stage3_conv42_fwd | Convolution | [1,256,14,14] | 5565.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 31.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
257 | resnetv24_stage3__plus13 | elemwise_add | [1,1024,14,14] | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
258 | resnetv24_stage3_batchnorm42_fwd | BatchNorm | [1,1024,14,14] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
259 | resnetv24_stage3_activation42 | Activation | [1,1024,14,14] | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
260 | resnetv24_stage3_conv43_fwd | Convolution | [1,1024,14,14] | 5839.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
261 | resnetv24_stage3_batchnorm43_fwd | BatchNorm | [1,256,14,14] | 81 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
262 | resnetv24_stage3_activation43 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
263 | resnetv24_stage3_conv44_fwd | Convolution | [1,256,14,14] | 12350.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
263 | resnetv24_stage3_conv44_fwd | Convolution | [1,256,14,14] | 12350.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
264 | resnetv24_stage3_batchnorm44_fwd | BatchNorm | [1,256,14,14] | 90.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
265 | resnetv24_stage3_activation44 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
266 | resnetv24_stage3_conv45_fwd | Convolution | [1,256,14,14] | 5465 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
267 | resnetv24_stage3__plus14 | elemwise_add | [1,1024,14,14] | 103.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
268 | resnetv24_stage3_batchnorm45_fwd | BatchNorm | [1,1024,14,14] | 38 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
269 | resnetv24_stage3_activation45 | Activation | [1,1024,14,14] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
270 | resnetv24_stage3_conv46_fwd | Convolution | [1,1024,14,14] | 5808.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
271 | resnetv24_stage3_batchnorm46_fwd | BatchNorm | [1,256,14,14] | 84.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
272 | resnetv24_stage3_activation46 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
273 | resnetv24_stage3_conv47_fwd | Convolution | [1,256,14,14] | 12340 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
273 | resnetv24_stage3_conv47_fwd | Convolution | [1,256,14,14] | 12340 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
274 | resnetv24_stage3_batchnorm47_fwd | BatchNorm | [1,256,14,14] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
275 | resnetv24_stage3_activation47 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
276 | resnetv24_stage3_conv48_fwd | Convolution | [1,256,14,14] | 5491.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 31.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
277 | resnetv24_stage3__plus15 | elemwise_add | [1,1024,14,14] | 96.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
278 | resnetv24_stage3_batchnorm48_fwd | BatchNorm | [1,1024,14,14] | 36.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
279 | resnetv24_stage3_activation48 | Activation | [1,1024,14,14] | 34.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
280 | resnetv24_stage3_conv49_fwd | Convolution | [1,1024,14,14] | 5810 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
281 | resnetv24_stage3_batchnorm49_fwd | BatchNorm | [1,256,14,14] | 79 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
282 | resnetv24_stage3_activation49 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
283 | resnetv24_stage3_conv50_fwd | Convolution | [1,256,14,14] | 12341.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
283 | resnetv24_stage3_conv50_fwd | Convolution | [1,256,14,14] | 12341.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
284 | resnetv24_stage3_batchnorm50_fwd | BatchNorm | [1,256,14,14] | 79.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
285 | resnetv24_stage3_activation50 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
286 | resnetv24_stage3_conv51_fwd | Convolution | [1,256,14,14] | 5491 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
287 | resnetv24_stage3__plus16 | elemwise_add | [1,1024,14,14] | 97 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
288 | resnetv24_stage3_batchnorm51_fwd | BatchNorm | [1,1024,14,14] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
289 | resnetv24_stage3_activation51 | Activation | [1,1024,14,14] | 34.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
290 | resnetv24_stage3_conv52_fwd | Convolution | [1,1024,14,14] | 5814 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
291 | resnetv24_stage3_batchnorm52_fwd | BatchNorm | [1,256,14,14] | 77.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
292 | resnetv24_stage3_activation52 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
293 | resnetv24_stage3_conv53_fwd | Convolution | [1,256,14,14] | 12520 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
293 | resnetv24_stage3_conv53_fwd | Convolution | [1,256,14,14] | 12520 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
294 | resnetv24_stage3_batchnorm53_fwd | BatchNorm | [1,256,14,14] | 77.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
295 | resnetv24_stage3_activation53 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
296 | resnetv24_stage3_conv54_fwd | Convolution | [1,256,14,14] | 5504.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
297 | resnetv24_stage3__plus17 | elemwise_add | [1,1024,14,14] | 101 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
298 | resnetv24_stage3_batchnorm54_fwd | BatchNorm | [1,1024,14,14] | 35.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
299 | resnetv24_stage3_activation54 | Activation | [1,1024,14,14] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
300 | resnetv24_stage3_conv55_fwd | Convolution | [1,1024,14,14] | 5795.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
301 | resnetv24_stage3_batchnorm55_fwd | BatchNorm | [1,256,14,14] | 79.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
302 | resnetv24_stage3_activation55 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
303 | resnetv24_stage3_conv56_fwd | Convolution | [1,256,14,14] | 12312.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
303 | resnetv24_stage3_conv56_fwd | Convolution | [1,256,14,14] | 12312.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
304 | resnetv24_stage3_batchnorm56_fwd | BatchNorm | [1,256,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
305 | resnetv24_stage3_activation56 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
306 | resnetv24_stage3_conv57_fwd | Convolution | [1,256,14,14] | 5503.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 31.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
307 | resnetv24_stage3__plus18 | elemwise_add | [1,1024,14,14] | 108.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
308 | resnetv24_stage3_batchnorm57_fwd | BatchNorm | [1,1024,14,14] | 35.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
309 | resnetv24_stage3_activation57 | Activation | [1,1024,14,14] | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
310 | resnetv24_stage3_conv58_fwd | Convolution | [1,1024,14,14] | 5763 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
311 | resnetv24_stage3_batchnorm58_fwd | BatchNorm | [1,256,14,14] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
312 | resnetv24_stage3_activation58 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
313 | resnetv24_stage3_conv59_fwd | Convolution | [1,256,14,14] | 12309 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
313 | resnetv24_stage3_conv59_fwd | Convolution | [1,256,14,14] | 12309 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
314 | resnetv24_stage3_batchnorm59_fwd | BatchNorm | [1,256,14,14] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
315 | resnetv24_stage3_activation59 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
316 | resnetv24_stage3_conv60_fwd | Convolution | [1,256,14,14] | 5508 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
317 | resnetv24_stage3__plus19 | elemwise_add | [1,1024,14,14] | 99.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
318 | resnetv24_stage3_batchnorm60_fwd | BatchNorm | [1,1024,14,14] | 38.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
319 | resnetv24_stage3_activation60 | Activation | [1,1024,14,14] | 35.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
320 | resnetv24_stage3_conv61_fwd | Convolution | [1,1024,14,14] | 5835 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
321 | resnetv24_stage3_batchnorm61_fwd | BatchNorm | [1,256,14,14] | 81 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
322 | resnetv24_stage3_activation61 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
323 | resnetv24_stage3_conv62_fwd | Convolution | [1,256,14,14] | 12306 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
323 | resnetv24_stage3_conv62_fwd | Convolution | [1,256,14,14] | 12306 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
324 | resnetv24_stage3_batchnorm62_fwd | BatchNorm | [1,256,14,14] | 83 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
325 | resnetv24_stage3_activation62 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
326 | resnetv24_stage3_conv63_fwd | Convolution | [1,256,14,14] | 5503 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
327 | resnetv24_stage3__plus20 | elemwise_add | [1,1024,14,14] | 98.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
328 | resnetv24_stage3_batchnorm63_fwd | BatchNorm | [1,1024,14,14] | 35.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
329 | resnetv24_stage3_activation63 | Activation | [1,1024,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
330 | resnetv24_stage3_conv64_fwd | Convolution | [1,1024,14,14] | 5808.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
331 | resnetv24_stage3_batchnorm64_fwd | BatchNorm | [1,256,14,14] | 82.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
332 | resnetv24_stage3_activation64 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
333 | resnetv24_stage3_conv65_fwd | Convolution | [1,256,14,14] | 12788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
333 | resnetv24_stage3_conv65_fwd | Convolution | [1,256,14,14] | 12788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
334 | resnetv24_stage3_batchnorm65_fwd | BatchNorm | [1,256,14,14] | 92.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
335 | resnetv24_stage3_activation65 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
336 | resnetv24_stage3_conv66_fwd | Convolution | [1,256,14,14] | 5608.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
337 | resnetv24_stage3__plus21 | elemwise_add | [1,1024,14,14] | 103.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
338 | resnetv24_stage3_batchnorm66_fwd | BatchNorm | [1,1024,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
339 | resnetv24_stage3_activation66 | Activation | [1,1024,14,14] | 35.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
340 | resnetv24_stage3_conv67_fwd | Convolution | [1,1024,14,14] | 5825.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
341 | resnetv24_stage3_batchnorm67_fwd | BatchNorm | [1,256,14,14] | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
342 | resnetv24_stage3_activation67 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
343 | resnetv24_stage3_conv68_fwd | Convolution | [1,256,14,14] | 12365 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
343 | resnetv24_stage3_conv68_fwd | Convolution | [1,256,14,14] | 12365 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
344 | resnetv24_stage3_batchnorm68_fwd | BatchNorm | [1,256,14,14] | 84.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
345 | resnetv24_stage3_activation68 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
346 | resnetv24_stage3_conv69_fwd | Convolution | [1,256,14,14] | 5505.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
347 | resnetv24_stage3__plus22 | elemwise_add | [1,1024,14,14] | 98 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
348 | resnetv24_stage3_batchnorm69_fwd | BatchNorm | [1,1024,14,14] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
349 | resnetv24_stage3_activation69 | Activation | [1,1024,14,14] | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
350 | resnetv24_stage3_conv70_fwd | Convolution | [1,1024,14,14] | 5804.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
351 | resnetv24_stage3_batchnorm70_fwd | BatchNorm | [1,256,14,14] | 85 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
352 | resnetv24_stage3_activation70 | Activation | [1,256,14,14] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
353 | resnetv24_stage3_conv71_fwd | Convolution | [1,256,14,14] | 12488 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
353 | resnetv24_stage3_conv71_fwd | Convolution | [1,256,14,14] | 12488 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
354 | resnetv24_stage3_batchnorm71_fwd | BatchNorm | [1,256,14,14] | 85.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
355 | resnetv24_stage3_activation71 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
356 | resnetv24_stage3_conv72_fwd | Convolution | [1,256,14,14] | 5525.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
357 | resnetv24_stage3__plus23 | elemwise_add | [1,1024,14,14] | 109 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
358 | resnetv24_stage3_batchnorm72_fwd | BatchNorm | [1,1024,14,14] | 35.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
359 | resnetv24_stage3_activation72 | Activation | [1,1024,14,14] | 34.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
360 | resnetv24_stage3_conv73_fwd | Convolution | [1,1024,14,14] | 5883 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
361 | resnetv24_stage3_batchnorm73_fwd | BatchNorm | [1,256,14,14] | 84 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
362 | resnetv24_stage3_activation73 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
363 | resnetv24_stage3_conv74_fwd | Convolution | [1,256,14,14] | 12382 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
363 | resnetv24_stage3_conv74_fwd | Convolution | [1,256,14,14] | 12382 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
364 | resnetv24_stage3_batchnorm74_fwd | BatchNorm | [1,256,14,14] | 81.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
365 | resnetv24_stage3_activation74 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
366 | resnetv24_stage3_conv75_fwd | Convolution | [1,256,14,14] | 5536 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
367 | resnetv24_stage3__plus24 | elemwise_add | [1,1024,14,14] | 101.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
368 | resnetv24_stage3_batchnorm75_fwd | BatchNorm | [1,1024,14,14] | 38.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
369 | resnetv24_stage3_activation75 | Activation | [1,1024,14,14] | 33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
370 | resnetv24_stage3_conv76_fwd | Convolution | [1,1024,14,14] | 5819.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
371 | resnetv24_stage3_batchnorm76_fwd | BatchNorm | [1,256,14,14] | 88.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
372 | resnetv24_stage3_activation76 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
373 | resnetv24_stage3_conv77_fwd | Convolution | [1,256,14,14] | 12802.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
373 | resnetv24_stage3_conv77_fwd | Convolution | [1,256,14,14] | 12802.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
374 | resnetv24_stage3_batchnorm77_fwd | BatchNorm | [1,256,14,14] | 95 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
375 | resnetv24_stage3_activation77 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
376 | resnetv24_stage3_conv78_fwd | Convolution | [1,256,14,14] | 5648.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
377 | resnetv24_stage3__plus25 | elemwise_add | [1,1024,14,14] | 115 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
378 | resnetv24_stage3_batchnorm78_fwd | BatchNorm | [1,1024,14,14] | 36.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
379 | resnetv24_stage3_activation78 | Activation | [1,1024,14,14] | 39.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
380 | resnetv24_stage3_conv79_fwd | Convolution | [1,1024,14,14] | 5843.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
381 | resnetv24_stage3_batchnorm79_fwd | BatchNorm | [1,256,14,14] | 87.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
382 | resnetv24_stage3_activation79 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
383 | resnetv24_stage3_conv80_fwd | Convolution | [1,256,14,14] | 12305.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
383 | resnetv24_stage3_conv80_fwd | Convolution | [1,256,14,14] | 12305.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
384 | resnetv24_stage3_batchnorm80_fwd | BatchNorm | [1,256,14,14] | 77.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
385 | resnetv24_stage3_activation80 | Activation | [1,256,14,14] | 10.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
386 | resnetv24_stage3_conv81_fwd | Convolution | [1,256,14,14] | 5531.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
387 | resnetv24_stage3__plus26 | elemwise_add | [1,1024,14,14] | 98.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
388 | resnetv24_stage3_batchnorm81_fwd | BatchNorm | [1,1024,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
389 | resnetv24_stage3_activation81 | Activation | [1,1024,14,14] | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
390 | resnetv24_stage3_conv82_fwd | Convolution | [1,1024,14,14] | 5787.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
391 | resnetv24_stage3_batchnorm82_fwd | BatchNorm | [1,256,14,14] | 80.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
392 | resnetv24_stage3_activation82 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
393 | resnetv24_stage3_conv83_fwd | Convolution | [1,256,14,14] | 12363.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
393 | resnetv24_stage3_conv83_fwd | Convolution | [1,256,14,14] | 12363.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
394 | resnetv24_stage3_batchnorm83_fwd | BatchNorm | [1,256,14,14] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
395 | resnetv24_stage3_activation83 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
396 | resnetv24_stage3_conv84_fwd | Convolution | [1,256,14,14] | 5506.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
397 | resnetv24_stage3__plus27 | elemwise_add | [1,1024,14,14] | 95.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
398 | resnetv24_stage3_batchnorm84_fwd | BatchNorm | [1,1024,14,14] | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
399 | resnetv24_stage3_activation84 | Activation | [1,1024,14,14] | 34.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
400 | resnetv24_stage3_conv85_fwd | Convolution | [1,1024,14,14] | 5815.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
401 | resnetv24_stage3_batchnorm85_fwd | BatchNorm | [1,256,14,14] | 80.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
402 | resnetv24_stage3_activation85 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
403 | resnetv24_stage3_conv86_fwd | Convolution | [1,256,14,14] | 12328.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
403 | resnetv24_stage3_conv86_fwd | Convolution | [1,256,14,14] | 12328.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
404 | resnetv24_stage3_batchnorm86_fwd | BatchNorm | [1,256,14,14] | 81.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
405 | resnetv24_stage3_activation86 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
406 | resnetv24_stage3_conv87_fwd | Convolution | [1,256,14,14] | 5535 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
407 | resnetv24_stage3__plus28 | elemwise_add | [1,1024,14,14] | 101 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
408 | resnetv24_stage3_batchnorm87_fwd | BatchNorm | [1,1024,14,14] | 35.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
409 | resnetv24_stage3_activation87 | Activation | [1,1024,14,14] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
410 | resnetv24_stage3_conv88_fwd | Convolution | [1,1024,14,14] | 5810.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
411 | resnetv24_stage3_batchnorm88_fwd | BatchNorm | [1,256,14,14] | 81.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
412 | resnetv24_stage3_activation88 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
413 | resnetv24_stage3_conv89_fwd | Convolution | [1,256,14,14] | 12484 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
413 | resnetv24_stage3_conv89_fwd | Convolution | [1,256,14,14] | 12484 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
414 | resnetv24_stage3_batchnorm89_fwd | BatchNorm | [1,256,14,14] | 88.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
415 | resnetv24_stage3_activation89 | Activation | [1,256,14,14] | 10.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
416 | resnetv24_stage3_conv90_fwd | Convolution | [1,256,14,14] | 5630.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
417 | resnetv24_stage3__plus29 | elemwise_add | [1,1024,14,14] | 111.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
418 | resnetv24_stage3_batchnorm90_fwd | BatchNorm | [1,1024,14,14] | 39.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
419 | resnetv24_stage3_activation90 | Activation | [1,1024,14,14] | 34.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
420 | resnetv24_stage3_conv91_fwd | Convolution | [1,1024,14,14] | 5913 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
421 | resnetv24_stage3_batchnorm91_fwd | BatchNorm | [1,256,14,14] | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
422 | resnetv24_stage3_activation91 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
423 | resnetv24_stage3_conv92_fwd | Convolution | [1,256,14,14] | 12482.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
423 | resnetv24_stage3_conv92_fwd | Convolution | [1,256,14,14] | 12482.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
424 | resnetv24_stage3_batchnorm92_fwd | BatchNorm | [1,256,14,14] | 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
425 | resnetv24_stage3_activation92 | Activation | [1,256,14,14] | 11.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
426 | resnetv24_stage3_conv93_fwd | Convolution | [1,256,14,14] | 5555 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 31.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
427 | resnetv24_stage3__plus30 | elemwise_add | [1,1024,14,14] | 112.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
428 | resnetv24_stage3_batchnorm93_fwd | BatchNorm | [1,1024,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
429 | resnetv24_stage3_activation93 | Activation | [1,1024,14,14] | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
430 | resnetv24_stage3_conv94_fwd | Convolution | [1,1024,14,14] | 5855 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
431 | resnetv24_stage3_batchnorm94_fwd | BatchNorm | [1,256,14,14] | 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
432 | resnetv24_stage3_activation94 | Activation | [1,256,14,14] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
433 | resnetv24_stage3_conv95_fwd | Convolution | [1,256,14,14] | 12412.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
433 | resnetv24_stage3_conv95_fwd | Convolution | [1,256,14,14] | 12412.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
434 | resnetv24_stage3_batchnorm95_fwd | BatchNorm | [1,256,14,14] | 84.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
435 | resnetv24_stage3_activation95 | Activation | [1,256,14,14] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
436 | resnetv24_stage3_conv96_fwd | Convolution | [1,256,14,14] | 5518.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 31.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
437 | resnetv24_stage3__plus31 | elemwise_add | [1,1024,14,14] | 101.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
438 | resnetv24_stage3_batchnorm96_fwd | BatchNorm | [1,1024,14,14] | 35.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
439 | resnetv24_stage3_activation96 | Activation | [1,1024,14,14] | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
440 | resnetv24_stage3_conv97_fwd | Convolution | [1,1024,14,14] | 5813.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
441 | resnetv24_stage3_batchnorm97_fwd | BatchNorm | [1,256,14,14] | 93 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
442 | resnetv24_stage3_activation97 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
443 | resnetv24_stage3_conv98_fwd | Convolution | [1,256,14,14] | 12388.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
443 | resnetv24_stage3_conv98_fwd | Convolution | [1,256,14,14] | 12388.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
444 | resnetv24_stage3_batchnorm98_fwd | BatchNorm | [1,256,14,14] | 82 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
445 | resnetv24_stage3_activation98 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
446 | resnetv24_stage3_conv99_fwd | Convolution | [1,256,14,14] | 5564.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
447 | resnetv24_stage3__plus32 | elemwise_add | [1,1024,14,14] | 114.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
448 | resnetv24_stage3_batchnorm99_fwd | BatchNorm | [1,1024,14,14] | 36.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
449 | resnetv24_stage3_activation99 | Activation | [1,1024,14,14] | 35.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
450 | resnetv24_stage3_conv100_fwd | Convolution | [1,1024,14,14] | 5870 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
451 | resnetv24_stage3_batchnorm100_fwd | BatchNorm | [1,256,14,14] | 89.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
452 | resnetv24_stage3_activation100 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
453 | resnetv24_stage3_conv101_fwd | Convolution | [1,256,14,14] | 12389.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
453 | resnetv24_stage3_conv101_fwd | Convolution | [1,256,14,14] | 12389.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
454 | resnetv24_stage3_batchnorm101_fwd | BatchNorm | [1,256,14,14] | 93 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
455 | resnetv24_stage3_activation101 | Activation | [1,256,14,14] | 12.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
456 | resnetv24_stage3_conv102_fwd | Convolution | [1,256,14,14] | 5628 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
457 | resnetv24_stage3__plus33 | elemwise_add | [1,1024,14,14] | 107.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
458 | resnetv24_stage3_batchnorm102_fwd | BatchNorm | [1,1024,14,14] | 36.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
459 | resnetv24_stage3_activation102 | Activation | [1,1024,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
460 | resnetv24_stage3_conv103_fwd | Convolution | [1,1024,14,14] | 5887.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
461 | resnetv24_stage3_batchnorm103_fwd | BatchNorm | [1,256,14,14] | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
462 | resnetv24_stage3_activation103 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
463 | resnetv24_stage3_conv104_fwd | Convolution | [1,256,14,14] | 12449.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
463 | resnetv24_stage3_conv104_fwd | Convolution | [1,256,14,14] | 12449.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
464 | resnetv24_stage3_batchnorm104_fwd | BatchNorm | [1,256,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
465 | resnetv24_stage3_activation104 | Activation | [1,256,14,14] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
466 | resnetv24_stage3_conv105_fwd | Convolution | [1,256,14,14] | 5556 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
467 | resnetv24_stage3__plus34 | elemwise_add | [1,1024,14,14] | 103 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
468 | resnetv24_stage3_batchnorm105_fwd | BatchNorm | [1,1024,14,14] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
469 | resnetv24_stage3_activation105 | Activation | [1,1024,14,14] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
470 | resnetv24_stage3_conv106_fwd | Convolution | [1,1024,14,14] | 5809.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
471 | resnetv24_stage3_batchnorm106_fwd | BatchNorm | [1,256,14,14] | 82.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
472 | resnetv24_stage3_activation106 | Activation | [1,256,14,14] | 11.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
473 | resnetv24_stage3_conv107_fwd | Convolution | [1,256,14,14] | 12373.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 63.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
473 | resnetv24_stage3_conv107_fwd | Convolution | [1,256,14,14] | 12373.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 12.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
474 | resnetv24_stage3_batchnorm107_fwd | BatchNorm | [1,256,14,14] | 77 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
475 | resnetv24_stage3_activation107 | Activation | [1,256,14,14] | 10.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
476 | resnetv24_stage3_conv108_fwd | Convolution | [1,256,14,14] | 5503.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
477 | resnetv24_stage3__plus35 | elemwise_add | [1,1024,14,14] | 102 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
478 | resnetv24_stage4_batchnorm0_fwd | BatchNorm | [1,1024,14,14] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
479 | resnetv24_stage4_activation0 | Activation | [1,1024,14,14] | 35.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
480 | resnetv24_stage4_conv0_fwd | Convolution | [1,1024,14,14] | 10602.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
481 | resnetv24_stage4_batchnorm1_fwd | BatchNorm | [1,512,14,14] | 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
482 | resnetv24_stage4_activation1 | Activation | [1,512,14,14] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
483 | resnetv24_stage4_conv1_fwd | Convolution | [1,512,14,14] | 15019 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::explicit_convolve_sgemm<float, int, 128, 5, 5, 3, 3, 3, 0, true>(int, int, int, float const*, int, float const*, int, float*, kernel_conv_params, int, int, float, float, int, float*, float*) | 269.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
483 | resnetv24_stage4_conv1_fwd | Convolution | [1,512,14,14] | 15019 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void im2col4d_kernel<float, int>(im2col4d_params, cudnnConvolutionStruct, cudnnTensor4dStruct, float const*, float*, int) | 39.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
484 | resnetv24_stage4_batchnorm2_fwd | BatchNorm | [1,512,7,7] | 84.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
485 | resnetv24_stage4_activation2 | Activation | [1,512,7,7] | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
486 | resnetv24_stage4_conv2_fwd | Convolution | [1,512,7,7] | 6845.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 46.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
487 | resnetv24_stage4_conv3_fwd | Convolution | [1,1024,14,14] | 12087 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
488 | resnetv24_stage4__plus0 | elemwise_add | [1,2048,7,7] | 90.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
489 | resnetv24_stage4_batchnorm3_fwd | BatchNorm | [1,2048,7,7] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
490 | resnetv24_stage4_activation3 | Activation | [1,2048,7,7] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
491 | resnetv24_stage4_conv4_fwd | Convolution | [1,2048,7,7] | 7106.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 112.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
492 | resnetv24_stage4_batchnorm4_fwd | BatchNorm | [1,512,7,7] | 90 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
493 | resnetv24_stage4_activation4 | Activation | [1,512,7,7] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
494 | resnetv24_stage4_conv5_fwd | Convolution | [1,512,7,7] | 15623.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 121.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
494 | resnetv24_stage4_conv5_fwd | Convolution | [1,512,7,7] | 15623.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 51.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
495 | resnetv24_stage4_batchnorm5_fwd | BatchNorm | [1,512,7,7] | 94.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
496 | resnetv24_stage4_activation5 | Activation | [1,512,7,7] | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
497 | resnetv24_stage4_conv6_fwd | Convolution | [1,512,7,7] | 6892.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 46.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
498 | resnetv24_stage4__plus1 | elemwise_add | [1,2048,7,7] | 100 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
499 | resnetv24_stage4_batchnorm6_fwd | BatchNorm | [1,2048,7,7] | 37.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
500 | resnetv24_stage4_activation6 | Activation | [1,2048,7,7] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
501 | resnetv24_stage4_conv7_fwd | Convolution | [1,2048,7,7] | 7040 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 112.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
502 | resnetv24_stage4_batchnorm7_fwd | BatchNorm | [1,512,7,7] | 97 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
503 | resnetv24_stage4_activation7 | Activation | [1,512,7,7] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
504 | resnetv24_stage4_conv8_fwd | Convolution | [1,512,7,7] | 15326.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 121.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
504 | resnetv24_stage4_conv8_fwd | Convolution | [1,512,7,7] | 15326.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 50.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
505 | resnetv24_stage4_batchnorm8_fwd | BatchNorm | [1,512,7,7] | 93.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
506 | resnetv24_stage4_activation8 | Activation | [1,512,7,7] | 7.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 3.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
507 | resnetv24_stage4_conv9_fwd | Convolution | [1,512,7,7] | 6889.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 45.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
508 | resnetv24_stage4__plus2 | elemwise_add | [1,2048,7,7] | 106 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 3.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
509 | resnetv24_batchnorm2_fwd | BatchNorm | [1,2048,7,7] | 36.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
510 | resnetv24_relu1_fwd | Activation | [1,2048,7,7] | 20.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
511 | resnetv24_pool1_fwd | Pooling | [1,2048,7,7] | 156.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 6.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
513 | resnetv24_dense0_fwd | FullyConnected | [1,2048] | 2077.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 4, 4, false, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>, float, float) | 20.00 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
513 | resnetv24_dense0_fwd | FullyConnected | [1,2048] | 2077.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 2.67 | 0 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | true | |
Showing 1 to 579 of 579 entries