GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | resnetv13_conv0_fwd | Convolution | [1,3,224,224] | 20052.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 44.00 | 244858880 | 202026.67 | 1160394.67 | 21.10 | 179.72 | 5564.97 | false | 0.208131;0.215573;0.209961;0.212096;0.210669 | 244858880;244858880;244858880;244858880;244858880 | 171136;163200;254208;210176;224768 | 1139520;1122848;1196032;1170656;1171008 | |
1 | resnetv13_batchnorm0_fwd | BatchNorm | [1,64,112,112] | 297.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 12.00 | 4849664 | 20640.00 | 4386869.33 | 23.70 | 1.10 | 404.14 | true | 0.236486;0.236944;0.236750;0.237090;0.236709 | 4849664;4849664;4849664;4849664;4849664 | 19872;16416;23712;28192;18336 | 4396192;4394432;4341856;4392672;4373504 | |
2 | resnetv13_relu0_fwd | Activation | [1,64,112,112] | 253.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 53237.33 | 1161909.33 | 76.40 | 1.32 | 229.38 | true | 0.762313;0.761125;0.765990;0.763053;0.772535 | 1605632;1605632;1605632;1605632;1605632 | 59296;48160;51488;64032;48928 | 1162688;1165696;1154816;1157344;1184832 | |
3 | resnetv13_pool0_fwd | Pooling | [1,64,112,112] | 3298 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.67 | 200704 | 1088.00 | 1140032.00 | 53.30 | 0.18 | 23.16 | true | 0.532404;0.532303;0.532290;0.532860;0.534755 | 200704;200704;200704;200704;200704 | 64;1088;1088;1088;3904 | 1144576;1130368;1147040;1137152;1138368 | |
4 | resnetv13_stage1_conv0_fwd | Convolution | [1,64,56,56] | 2259.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 13.00 | 25890816 | 16384.00 | 707509.33 | 7.00 | 35.77 | 1991.60 | false | 0.070245;0.070212;0.070052;0.069369;0.069855 | 25890816;25890816;25890816;25890816;25890816 | 717024;701312;701888;711808;708832 | 16384;16384;16384;16384;16384 | |
4 | resnetv13_stage1_conv0_fwd | Convolution | [1,64,56,56] | 2259.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1003520 | 480.00 | 1792.00 | 67.50 | 441.69 | 167.25 | false | 0.683169;0.679921;0.672458;0.671524;0.672500 | 1003520;1003520;1003520;1003520;1003520 | 480;480;480;480;480 | 1664;2048;1408;1664;3712 | |
5 | resnetv13_stage1_batchnorm0_fwd | BatchNorm | [1,64,56,56] | 81.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 1236992 | 1024.00 | 681493.33 | 22.70 | 1.81 | 218.28 | true | 0.227439;0.228520;0.227605;0.227171;0.227325 | 1236992;1236992;1236992;1236992;1236992 | 1024;1024;1024;1024;1024 | 688608;684512;678912;681056;676320 | |
6 | resnetv13_stage1_relu0_fwd | Activation | [1,64,56,56] | 67 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 401408 | 0.00 | 360469.33 | 52.10 | 1.11 | 86.01 | true | 0.521116;0.522097;0.522056;0.519293;0.519095 | 401408;401408;401408;401408;401408 | 512;0;0;0;0 | 357952;362048;360960;365152;358400 | |
7 | resnetv13_stage1_conv1_fwd | Convolution | [1,64,56,56] | 19681.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.67 | 124436480 | 341.33 | 340650.67 | 12.50 | 364.92 | 5743.13 | false | 0.124712;0.124720;0.124707;0.124720;0.124716 | 124436480;124436480;124436480;124436480;124436480 | 1280;512;256;256;0 | 346720;342240;343200;336512;331840 | |
7 | resnetv13_stage1_conv1_fwd | Convolution | [1,64,56,56] | 19681.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147626.67 | 68245.33 | 6.20 | 1.10 | 50.90 | true | 0.062345;0.062349;0.062350;0.062346;0.062351 | 237568;237568;237568;237568;237568 | 59200;62464;70048;74240;72224 | 147712;147712;147712;147456;147456 | |
8 | resnetv13_stage1_batchnorm1_fwd | BatchNorm | [1,64,56,56] | 115 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 1236992 | 1024.00 | 426.67 | 22.80 | 852.71 | 247.40 | false | 0.228269;0.227307;0.227525;0.227861;0.227264 | 1236992;1236992;1236992;1236992;1236992 | 1024;1024;1024;1024;1024 | 256;512;1024;256;512 | |
9 | resnetv13_stage1_relu1_fwd | Activation | [1,64,56,56] | 67.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 401408 | 0.00 | 256.00 | 52.10 | 1568.00 | 80.28 | false | 0.519858;0.520251;0.515575;0.526296;0.523559 | 401408;401408;401408;401408;401408 | 0;0;0;0;0 | 256;256;256;256;256 | |
10 | resnetv13_stage1_conv2_fwd | Convolution | [1,64,56,56] | 7199 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 24.00 | 106496000 | 68864.00 | 2340149.33 | 8.10 | 44.21 | 4437.33 | false | 0.080624;0.080829;0.080480;0.080430;0.081088 | 106496000;106496000;106496000;106496000;106496000 | 69376;69120;68096;68096;74496 | 2299488;2345408;2351648;2323392;2417280 | |
10 | resnetv13_stage1_conv2_fwd | Convolution | [1,64,56,56] | 7199 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 10.00 | 4014080 | 1024.00 | 1449792.00 | 69.80 | 2.77 | 401.41 | true | 0.705890;0.733163;0.692310;0.694136;0.694501 | 4014080;4014080;4014080;4014080;4014080 | 1024;1024;1024;1024;1024 | 1483328;1424320;1446560;1478336;1424480 | |
10 | resnetv13_stage1_conv2_fwd | Convolution | [1,64,56,56] | 7199 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 0.00 | 42.67 | 6.10 | 0.00 | 0.00 | true | 0.060927;0.060935;0.060917;0.060938;0.060984 | 0;0;0;0;0 | 1024;0;0;0;0 | 640;0;0;128;0 | |
11 | resnetv13_stage1_batchnorm2_fwd | BatchNorm | [1,256,56,56] | 536.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 4947968 | 85920.00 | 3254773.33 | 64.30 | 1.48 | 549.77 | true | 0.634194;0.636456;0.653603;0.647734;0.644563 | 4947968;4947968;4947968;4947968;4947968 | 3251680;3263008;3265504;3242784;3249632 | 82208;91296;93600;81952;84256 | |
12 | resnetv13_stage1_conv3_fwd | Convolution | [1,64,56,56] | 7047.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 24.00 | 106496000 | 868416.00 | 3135029.33 | 8.00 | 26.60 | 4437.33 | false | 0.080238;0.080321;0.080277;0.080238;0.080372 | 106496000;106496000;106496000;106496000;106496000 | 868416;868416;879424;868352;868416 | 3143072;3134144;3126304;3129088;3141856 | |
12 | resnetv13_stage1_conv3_fwd | Convolution | [1,64,56,56] | 7047.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 21248.00 | 6.10 | 0.00 | 0.00 | true | 0.061177;0.061187;0.061209;0.061224;0.061197 | 0;0;0;0;0 | 96;96;96;96;96 | 21504;21632;21504;20736;20224 | |
13 | resnetv13_stage1_batchnorm3_fwd | BatchNorm | [1,256,56,56] | 655.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 4947968 | 74858.67 | 3245408.00 | 65.40 | 1.49 | 549.77 | true | 0.646061;0.661461;0.656485;0.654729;0.651468 | 4947968;4947968;4947968;4947968;4947968 | 81600;73920;72640;77888;72768 | 3266592;3245792;3216576;3246496;3243936 | |
14 | add_resnetv13_stage1_activation0 | add_relu | [1,256,56,56] | 340.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 13.00 | 802816 | 4093632.00 | 3178421.33 | 79.90 | 0.11 | 61.76 | true | 0.799006;0.795642;0.801372;0.789014;0.806283 | 802816;802816;802816;802816;802816 | 4102592;4104384;4088896;4089408;4084416 | 3160224;3171776;3188704;3180992;3182496 | |
15 | resnetv13_stage1_conv4_fwd | Convolution | [1,256,56,56] | 7471.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 102961152 | 100032.00 | 927040.00 | 6.90 | 100.25 | 3432.04 | false | 0.069093;0.068944;0.068547;0.069339;0.069180 | 102961152;102961152;102961152;102961152;102961152 | 91712;98112;103424;102272;99712 | 926784;911872;946912;934880;919456 | |
15 | resnetv13_stage1_conv4_fwd | Convolution | [1,256,56,56] | 7471.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1003520 | 544.00 | 268394.67 | 70.60 | 3.73 | 167.25 | true | 0.708378;0.699542;0.710115;0.731857;0.697275 | 1003520;1003520;1003520;1003520;1003520 | 544;544;544;544;544 | 269024;283008;250048;259840;276320 | |
16 | resnetv13_stage1_batchnorm4_fwd | BatchNorm | [1,64,56,56] | 59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.33 | 1236992 | 1024.00 | 557248.00 | 22.80 | 2.22 | 231.95 | true | 0.227768;0.227062;0.227631;0.227934;0.227323 | 1236992;1236992;1236992;1236992;1236992 | 1024;1024;1024;1024;1024 | 574624;557344;556512;557888;555584 | |
17 | resnetv13_stage1_relu2_fwd | Activation | [1,64,56,56] | 71.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 401408 | 0.00 | 245930.67 | 51.90 | 1.63 | 92.64 | true | 0.518225;0.520308;0.525605;0.514871;0.518739 | 401408;401408;401408;401408;401408 | 0;0;0;0;0 | 227328;245504;247424;245248;247040 | |
18 | resnetv13_stage1_conv5_fwd | Convolution | [1,64,56,56] | 18682.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.00 | 124436480 | 0.00 | 261717.33 | 12.50 | 475.46 | 5925.55 | false | 0.124715;0.124708;0.124713;0.124712;0.124719 | 124436480;124436480;124436480;124436480;124436480 | 256;0;0;0;0 | 261248;261504;262272;261376;262400 | |
18 | resnetv13_stage1_conv5_fwd | Convolution | [1,64,56,56] | 18682.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147456.00 | 1429.33 | 6.20 | 1.60 | 50.90 | true | 0.062331;0.062331;0.062334;0.062336;0.062338 | 237568;237568;237568;237568;237568 | 147456;147456;147456;147456;147456 | 1536;1440;1312;1568;1312 | |
19 | resnetv13_stage1_batchnorm5_fwd | BatchNorm | [1,64,56,56] | 131 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.33 | 1236992 | 1024.00 | 256.00 | 22.70 | 966.40 | 231.95 | false | 0.227431;0.227174;0.227539;0.227645;0.226906 | 1236992;1236992;1236992;1236992;1236992 | 1024;1024;1024;1024;1024 | 256;256;256;256;256 | |
20 | resnetv13_stage1_relu3_fwd | Activation | [1,64,56,56] | 69 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 401408 | 0.00 | 256.00 | 51.90 | 1568.00 | 92.64 | false | 0.518520;0.518036;0.519487;0.519520;0.519927 | 401408;401408;401408;401408;401408 | 256;256;384;256;256 | 0;0;0;0;0 | |
21 | resnetv13_stage1_conv6_fwd | Convolution | [1,64,56,56] | 7217.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 24.00 | 106496000 | 65600.00 | 2412885.33 | 8.10 | 42.97 | 4437.33 | false | 0.080887;0.080683;0.080577;0.080612;0.080649 | 106496000;106496000;106496000;106496000;106496000 | 65600;65600;65600;65600;65600 | 2392352;2418784;2425376;2394496;2436256 | |
21 | resnetv13_stage1_conv6_fwd | Convolution | [1,64,56,56] | 7217.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 10.67 | 4014080 | 1248.00 | 1491850.67 | 71.00 | 2.69 | 376.31 | true | 0.708484;0.708368;0.716750;0.705050;0.713553 | 4014080;4014080;4014080;4014080;4014080 | 1248;1248;1248;1248;1248 | 1519264;1491168;1483008;1500096;1484288 | |
21 | resnetv13_stage1_conv6_fwd | Convolution | [1,64,56,56] | 7217.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 0.00 | 42.67 | 6.10 | 0.00 | 0.00 | true | 0.060916;0.060929;0.060991;0.060999;0.060922 | 0;0;0;0;0 | 0;0;0;6912;0 | 128;0;0;128;0 | |
22 | resnetv13_stage1_batchnorm6_fwd | BatchNorm | [1,256,56,56] | 218 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 4947968 | 74965.33 | 3198293.33 | 65.70 | 1.51 | 549.77 | true | 0.658163;0.656121;0.659251;0.655751;0.651116 | 4947968;4947968;4947968;4947968;4947968 | 76608;71264;77024;67040;77024 | 3194208;3200448;3197856;3196576;3214912 | |
23 | add_resnetv13_stage1_activation1 | add_relu | [1,256,56,56] | 349 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 10.67 | 802816 | 3877866.67 | 1488853.33 | 76.60 | 0.15 | 75.26 | true | 0.766026;0.759945;0.771583;0.768256;0.764799 | 802816;802816;802816;802816;802816 | 3880640;3888448;3869760;3878080;3874880 | 1495808;1453184;1491904;1492064;1482592 | |
24 | resnetv13_stage1_conv7_fwd | Convolution | [1,256,56,56] | 7483 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 29.00 | 102961152 | 68330.67 | 781973.33 | 6.90 | 121.09 | 3550.38 | false | 0.069036;0.069525;0.069617;0.067760;0.068227 | 102961152;102961152;102961152;102961152;102961152 | 69632;66816;67264;70464;68096 | 787840;778912;765056;784832;782176 | |
24 | resnetv13_stage1_conv7_fwd | Convolution | [1,256,56,56] | 7483 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1003520 | 544.00 | 265365.33 | 69.30 | 3.77 | 167.25 | true | 0.690781;0.688797;0.688372;0.714720;0.698264 | 1003520;1003520;1003520;1003520;1003520 | 544;544;544;544;544 | 259168;268288;275296;251744;268640 | |
25 | resnetv13_stage1_batchnorm7_fwd | BatchNorm | [1,64,56,56] | 62.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.33 | 1236992 | 1024.00 | 546784.00 | 22.70 | 2.26 | 231.95 | true | 0.227279;0.227136;0.227433;0.227488;0.228621 | 1236992;1236992;1236992;1236992;1236992 | 544224;543488;548544;552352;547584 | 1024;1024;1024;6144;1024 | |
26 | resnetv13_stage1_relu4_fwd | Activation | [1,64,56,56] | 67.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 401408 | 0.00 | 255978.67 | 52.10 | 1.57 | 86.01 | true | 0.522124;0.513325;0.521553;0.523886;0.519602 | 401408;401408;401408;401408;401408 | 258752;259328;254336;251232;254848 | 0;0;0;0;0 | |
27 | resnetv13_stage1_conv8_fwd | Convolution | [1,64,56,56] | 18348.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 21.33 | 124436480 | 0.00 | 261706.67 | 12.50 | 475.48 | 5833.05 | false | 0.124718;0.124711;0.124719;0.124720;0.124722 | 124436480;124436480;124436480;124436480;124436480 | 512;0;0;0;0 | 261760;261152;262272;261184;262176 | |
27 | resnetv13_stage1_conv8_fwd | Convolution | [1,64,56,56] | 18348.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 237568 | 147456.00 | 1621.33 | 6.20 | 1.59 | 59.39 | true | 0.062333;0.062335;0.062338;0.062333;0.062333 | 237568;237568;237568;237568;237568 | 147712;147456;147456;147456;147456 | 1664;1664;1536;1536;1664 | |
28 | resnetv13_stage1_batchnorm8_fwd | BatchNorm | [1,64,56,56] | 120 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 1236992 | 2474.67 | 256.00 | 22.80 | 453.00 | 218.28 | false | 0.227206;0.227604;0.227452;0.227880;0.227843 | 1236992;1236992;1236992;1236992;1236992 | 1024;1024;1024;5376;6912 | 256;256;384;256;256 | |
29 | resnetv13_stage1_relu5_fwd | Activation | [1,64,56,56] | 68.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 401408 | 0.00 | 298.67 | 52.30 | 1344.00 | 86.01 | false | 0.523062;0.522401;0.520044;0.525938;0.522729 | 401408;401408;401408;401408;401408 | 0;0;0;0;0 | 384;256;256;384;256 | |
30 | resnetv13_stage1_conv9_fwd | Convolution | [1,64,56,56] | 7188.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 24.00 | 106496000 | 65600.00 | 2414848.00 | 8.10 | 42.93 | 4437.33 | false | 0.081060;0.080762;0.080676;0.080698;0.080805 | 106496000;106496000;106496000;106496000;106496000 | 2399264;2401632;2426592;2447104;2416320 | 65600;65600;65536;65600;65600 | |
30 | resnetv13_stage1_conv9_fwd | Convolution | [1,64,56,56] | 7188.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 10.00 | 4014080 | 1248.00 | 1482389.33 | 70.10 | 2.71 | 401.41 | true | 0.693982;0.721987;0.692082;0.702893;0.706605 | 4014080;4014080;4014080;4014080;4014080 | 1478592;1484608;1494720;1441760;1483968 | 1248;1248;8416;1248;1248 | |
30 | resnetv13_stage1_conv9_fwd | Convolution | [1,64,56,56] | 7188.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 0.00 | 42.67 | 6.10 | 0.00 | 0.00 | true | 0.060987;0.060915;0.060923;0.060921;0.060932 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;0;0;128 | |
31 | resnetv13_stage1_batchnorm9_fwd | BatchNorm | [1,256,56,56] | 194 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 4947968 | 57397.33 | 3186474.67 | 65.30 | 1.53 | 549.77 | true | 0.637855;0.660089;0.653617;0.658610;0.647233 | 4947968;4947968;4947968;4947968;4947968 | 3179584;3187200;3145920;3192640;3194240 | 59488;57056;56416;56416;58720 | |
32 | add_resnetv13_stage1_activation2 | add_relu | [1,256,56,56] | 349 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 10.33 | 802816 | 3896426.67 | 1504010.67 | 76.70 | 0.15 | 77.69 | true | 0.763994;0.768323;0.755191;0.767356;0.776450 | 802816;802816;802816;802816;802816 | 3880256;3886016;3906368;3925184;3896896 | 1493856;1487744;1507552;1515104;1510624 | |
33 | resnetv13_stage2_conv0_fwd | Convolution | [1,256,56,56] | 3935.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 28.67 | 52529152 | 137770.67 | 588181.33 | 3.90 | 72.36 | 1832.39 | false | 0.038847;0.038994;0.039014;0.038765;0.039041 | 52529152;52529152;52529152;52529152;52529152 | 565632;575392;598688;620384;590464 | 137600;137792;137920;137408;138816 | |
33 | resnetv13_stage2_conv0_fwd | Convolution | [1,256,56,56] | 3935.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 501760 | 800.00 | 57205.33 | 54.70 | 8.65 | 100.35 | true | 0.554668;0.539447;0.551420;0.550440;0.526589 | 501760;501760;501760;501760;501760 | 800;800;800;800;800 | 59936;52640;59040;60256;40352 | |
34 | resnetv13_stage2_batchnorm0_fwd | BatchNorm | [1,128,28,28] | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 634880 | 2048.00 | 303189.33 | 17.80 | 2.08 | 146.52 | true | 0.177551;0.177895;0.177565;0.177270;0.177674 | 634880;634880;634880;634880;634880 | 2048;2048;2048;2048;2048 | 300960;304128;299968;304800;304480 | |
35 | resnetv13_stage2_relu0_fwd | Activation | [1,128,28,28] | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 200704 | 0.00 | 89450.67 | 48.00 | 2.24 | 50.18 | true | 0.477423;0.478767;0.480396;0.480123;0.479766 | 200704;200704;200704;200704;200704 | 768;0;0;0;0 | 91040;88384;90496;89472;87616 | |
36 | resnetv13_stage2_conv1_fwd | Convolution | [1,128,28,28] | 16560.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 397248.00 | 12.50 | 354.53 | 4023.91 | false | 0.124811;0.124813;0.124814;0.124818;0.124814 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 397472;396800;397120;397152;397760 | |
36 | resnetv13_stage2_conv1_fwd | Convolution | [1,128,28,28] | 16560.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.33 | 950272 | 589824.00 | 649109.33 | 9.80 | 0.77 | 178.19 | true | 0.097809;0.097975;0.097717;0.097562;0.097643 | 950272;950272;950272;950272;950272 | 589824;589824;589824;589824;589824 | 648704;649504;649120;650080;648128 | |
37 | resnetv13_stage2_batchnorm1_fwd | BatchNorm | [1,128,28,28] | 100.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 634880 | 2048.00 | 298.67 | 17.80 | 270.55 | 126.98 | false | 0.177802;0.177594;0.177792;0.178016;0.177722 | 634880;634880;634880;634880;634880 | 384;256;256;384;256 | 2048;2048;2048;2048;2048 | |
38 | resnetv13_stage2_relu1_fwd | Activation | [1,128,28,28] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 200704 | 0.00 | 128.00 | 47.80 | 1568.00 | 50.18 | false | 0.478210;0.477433;0.479305;0.478252;0.477809 | 200704;200704;200704;200704;200704 | 0;0;0;0;0 | 0;384;0;256;128 | |
39 | resnetv13_stage2_conv2_fwd | Convolution | [1,128,28,28] | 6967 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 105259008 | 294229.33 | 1096064.00 | 11.80 | 75.71 | 4210.36 | false | 0.118449;0.117359;0.117025;0.119806;0.118512 | 105259008;105259008;105259008;105259008;105259008 | 1089248;1116384;1082560;1119264;1078240 | 281472;298496;293248;293632;295808 | |
39 | resnetv13_stage2_conv2_fwd | Convolution | [1,128,28,28] | 6967 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 2007040 | 2048.00 | 141120.00 | 69.80 | 14.02 | 286.72 | true | 0.694682;0.698763;0.702759;0.697010;0.696765 | 2007040;2007040;2007040;2007040;2007040 | 2048;2048;2048;2048;2048 | 136128;128800;158432;117728;163808 | |
40 | resnetv13_stage2_batchnorm2_fwd | BatchNorm | [1,512,28,28] | 120.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 2539520 | 8192.00 | 1159946.67 | 56.80 | 2.17 | 423.25 | true | 0.568660;0.565401;0.565874;0.570232;0.568170 | 2539520;2539520;2539520;2539520;2539520 | 8192;8192;8192;8192;8192 | 1150400;1160064;1159904;1159872;1161632 | |
41 | resnetv13_stage2_conv3_fwd | Convolution | [1,256,56,56] | 13478.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.00 | 210116608 | 2339541.33 | 905941.33 | 12.70 | 64.74 | 5252.92 | false | 0.126673;0.127540;0.127005;0.125290;0.129207 | 210116608;210116608;210116608;210116608;210116608 | 2340352;2340288;2342272;2337984;2337408 | 925696;911200;899680;903392;903232 | |
42 | resnetv13_stage2_batchnorm3_fwd | BatchNorm | [1,512,28,28] | 165.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 2539520 | 8330.67 | 1398090.67 | 58.00 | 1.81 | 423.25 | true | 0.578467;0.583414;0.584626;0.577698;0.563433 | 2539520;2539520;2539520;2539520;2539520 | 8352;13920;8352;8256;8288 | 1390016;1399968;1401408;1398272;1396032 | |
43 | add_resnetv13_stage2_activation0 | add_relu | [1,512,28,28] | 151.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 6.00 | 401408 | 636288.00 | 1300576.00 | 59.20 | 0.21 | 66.90 | true | 0.593232;0.595178;0.593432;0.587902;0.590517 | 401408;401408;401408;401408;401408 | 639872;644096;634112;633728;634880 | 1306688;1299232;1300064;1296960;1302432 | |
44 | resnetv13_stage2_conv4_fwd | Convolution | [1,512,28,28] | 7181.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 51.00 | 104957952 | 262144.00 | 790496.00 | 3.90 | 99.71 | 2058.00 | false | 0.039172;0.038979;0.039053;0.039722;0.039085 | 104957952;104957952;104957952;104957952;104957952 | 262144;262144;262144;262144;262144 | 787424;790688;790432;791104;790368 | |
44 | resnetv13_stage2_conv4_fwd | Convolution | [1,512,28,28] | 7181.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 501760 | 736.00 | 1216.00 | 55.70 | 257.05 | 100.35 | false | 0.551275;0.562098;0.557903;0.564592;0.550026 | 501760;501760;501760;501760;501760 | 736;608;736;736;736 | 1184;1280;1184;1184;1312 | |
45 | resnetv13_stage2_batchnorm4_fwd | BatchNorm | [1,128,28,28] | 32 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 634880 | 2048.00 | 259584.00 | 17.80 | 2.43 | 126.98 | true | 0.177523;0.177607;0.177639;0.177700;0.177821 | 634880;634880;634880;634880;634880 | 2048;2048;2048;2048;2048 | 258976;259232;260544;261792;256032 | |
46 | resnetv13_stage2_relu2_fwd | Activation | [1,128,28,28] | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 200704 | 0.00 | 131648.00 | 47.80 | 1.52 | 50.18 | true | 0.478147;0.478430;0.477985;0.478894;0.480221 | 200704;200704;200704;200704;200704 | 0;0;0;0;0 | 130272;132160;132512;130048;133664 | |
47 | resnetv13_stage2_conv5_fwd | Convolution | [1,128,28,28] | 16103.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 393130.67 | 12.50 | 358.24 | 4023.91 | false | 0.124815;0.124816;0.124819;0.124812;0.124816 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;2560;0 | 393344;392064;393344;392832;393216 | |
47 | resnetv13_stage2_conv5_fwd | Convolution | [1,128,28,28] | 16103.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.33 | 950272 | 589824.00 | 655776.00 | 9.80 | 0.76 | 178.19 | true | 0.097801;0.097918;0.097624;0.098147;0.097589 | 950272;950272;950272;950272;950272 | 589824;589824;589824;589824;589824 | 655744;655712;655424;655872;655936 | |
48 | resnetv13_stage2_batchnorm5_fwd | BatchNorm | [1,128,28,28] | 93.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 634880 | 2048.00 | 256.00 | 17.80 | 275.56 | 126.98 | false | 0.177966;0.177438;0.177588;0.179200;0.178174 | 634880;634880;634880;634880;634880 | 2048;2048;2048;2048;2304 | 256;256;256;256;256 | |
49 | resnetv13_stage2_relu3_fwd | Activation | [1,128,28,28] | 35.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 200704 | 0.00 | 256.00 | 47.90 | 784.00 | 40.14 | false | 0.477806;0.479166;0.478750;0.478108;0.481336 | 200704;200704;200704;200704;200704 | 256;256;384;256;256 | 0;0;0;0;0 | |
50 | resnetv13_stage2_conv6_fwd | Convolution | [1,128,28,28] | 6993 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 105259008 | 269205.33 | 1070144.00 | 11.80 | 78.59 | 4210.36 | false | 0.117043;0.118684;0.117773;0.117191;0.118692 | 105259008;105259008;105259008;105259008;105259008 | 269184;268288;265536;270144;271488 | 1060192;1089312;1049600;1096256;1060928 | |
50 | resnetv13_stage2_conv6_fwd | Convolution | [1,128,28,28] | 6993 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 2007040 | 2048.00 | 143712.00 | 69.90 | 13.77 | 286.72 | true | 0.696668;0.703130;0.698117;0.696447;0.710733 | 2007040;2007040;2007040;2007040;2007040 | 152800;122976;159904;117216;155360 | 2048;2048;2048;2048;2048 | |
51 | resnetv13_stage2_batchnorm6_fwd | BatchNorm | [1,512,28,28] | 118.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 2539520 | 8192.00 | 1153002.67 | 56.90 | 2.19 | 423.25 | true | 0.569880;0.570959;0.566994;0.565657;0.571668 | 2539520;2539520;2539520;2539520;2539520 | 1153984;1155200;1152640;1152384;1151424 | 8192;8192;8192;8192;8192 | |
52 | add_resnetv13_stage2_activation1 | add_relu | [1,512,28,28] | 160.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 6.00 | 401408 | 1225216.00 | 146432.00 | 58.40 | 0.29 | 66.90 | true | 0.573239;0.583148;0.585195;0.583968;0.590257 | 401408;401408;401408;401408;401408 | 1224320;1225088;1226240;1217024;1236736 | 153248;137696;145024;144704;149568 | |
53 | resnetv13_stage2_conv7_fwd | Convolution | [1,512,28,28] | 7221 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 51.00 | 104957952 | 262144.00 | 150506.67 | 3.90 | 254.35 | 2058.00 | false | 0.038999;0.039131;0.039126;0.039103;0.039396 | 104957952;104957952;104957952;104957952;104957952 | 262144;262144;262144;262144;262144 | 154560;147520;149440;155392;146624 | |
53 | resnetv13_stage2_conv7_fwd | Convolution | [1,512,28,28] | 7221 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 501760 | 512.00 | 842.67 | 58.70 | 370.39 | 100.35 | false | 0.590806;0.586205;0.585316;0.588578;0.561622 | 501760;501760;501760;501760;501760 | 512;512;512;512;512 | 928;800;928;672;800 | |
54 | resnetv13_stage2_batchnorm7_fwd | BatchNorm | [1,128,28,28] | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 634880 | 2048.00 | 309397.33 | 17.70 | 2.04 | 126.98 | true | 0.177148;0.177734;0.177883;0.177045;0.177304 | 634880;634880;634880;634880;634880 | 2048;3072;2048;2048;2048 | 309440;309568;308448;309664;309184 | |
55 | resnetv13_stage2_relu4_fwd | Activation | [1,128,28,28] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 200704 | 0.00 | 92032.00 | 47.90 | 2.18 | 43.00 | true | 0.479211;0.478724;0.480136;0.480073;0.479121 | 200704;200704;200704;200704;200704 | 512;0;0;0;0 | 92032;92160;93184;91904;91424 | |
56 | resnetv13_stage2_conv8_fwd | Convolution | [1,128,28,28] | 16525.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 401194.67 | 12.50 | 351.04 | 4023.91 | false | 0.124811;0.124813;0.124817;0.124812;0.124820 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 400896;402816;399872;403328;399360 | |
56 | resnetv13_stage2_conv8_fwd | Convolution | [1,128,28,28] | 16525.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 950272 | 589824.00 | 647690.67 | 9.80 | 0.77 | 190.05 | true | 0.097921;0.097731;0.097646;0.097609;0.097738 | 950272;950272;950272;950272;950272 | 589824;589824;589824;589824;589824 | 648096;646016;648960;645376;649728 | |
57 | resnetv13_stage2_batchnorm8_fwd | BatchNorm | [1,128,28,28] | 90.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 634880 | 2048.00 | 256.00 | 17.80 | 275.56 | 158.72 | false | 0.178053;0.177859;0.177854;0.178016;0.177263 | 634880;634880;634880;634880;634880 | 2048;2048;2048;2048;2048 | 256;256;256;256;256 | |
58 | resnetv13_stage2_relu5_fwd | Activation | [1,128,28,28] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 200704 | 0.00 | 128.00 | 47.90 | 1568.00 | 50.18 | false | 0.478786;0.479672;0.479820;0.477514;0.480405 | 200704;200704;200704;200704;200704 | 0;0;0;0;0 | 0;256;128;256;0 | |
59 | resnetv13_stage2_conv9_fwd | Convolution | [1,128,28,28] | 6992 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 105259008 | 280938.67 | 1095125.33 | 11.90 | 76.49 | 4210.36 | false | 0.119271;0.117437;0.117966;0.118414;0.121312 | 105259008;105259008;105259008;105259008;105259008 | 275584;286016;278080;282624;282112 | 1085120;1110432;1091840;1097760;1095776 | |
59 | resnetv13_stage2_conv9_fwd | Convolution | [1,128,28,28] | 6992 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 2007040 | 2048.00 | 129994.67 | 70.90 | 15.20 | 286.72 | true | 0.705532;0.709389;0.708254;0.711920;0.708942 | 2007040;2007040;2007040;2007040;2007040 | 133440;122976;130976;127328;131680 | 2048;2048;2048;2304;2048 | |
60 | resnetv13_stage2_batchnorm9_fwd | BatchNorm | [1,512,28,28] | 103 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 2539520 | 8192.00 | 1158432.00 | 57.10 | 2.18 | 448.12 | true | 0.569056;0.573201;0.568459;0.569334;0.573782 | 2539520;2539520;2539520;2539520;2539520 | 1160288;1158144;1156864;1156000;1162976 | 8192;8192;8192;8192;8192 | |
61 | add_resnetv13_stage2_activation2 | add_relu | [1,512,28,28] | 151.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 6.00 | 401408 | 1373226.67 | 169130.67 | 58.50 | 0.26 | 66.90 | true | 0.578533;0.580650;0.594399;0.590112;0.584848 | 401408;401408;401408;401408;401408 | 1372416;1371904;1359104;1375360;1396736 | 170016;170624;168416;168960;162720 | |
62 | resnetv13_stage2_conv10_fwd | Convolution | [1,512,28,28] | 7052 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 51.00 | 104957952 | 262144.00 | 155520.00 | 3.90 | 251.30 | 2058.00 | false | 0.039085;0.039286;0.038871;0.039135;0.039138 | 104957952;104957952;104957952;104957952;104957952 | 262144;262400;262144;262144;262144 | 159040;151712;154944;157696;153920 | |
62 | resnetv13_stage2_conv10_fwd | Convolution | [1,512,28,28] | 7052 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 501760 | 512.00 | 757.33 | 58.60 | 395.29 | 100.35 | false | 0.589715;0.583726;0.586580;0.586500;0.580896 | 501760;501760;501760;501760;501760 | 2560;512;512;512;512 | 672;800;672;800;800 | |
63 | resnetv13_stage2_batchnorm10_fwd | BatchNorm | [1,128,28,28] | 39.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 634880 | 2048.00 | 263221.33 | 17.80 | 2.39 | 136.04 | true | 0.177515;0.179044;0.178961;0.177422;0.178179 | 634880;634880;634880;634880;634880 | 263584;262880;261856;263200;267840 | 2048;2048;2048;2048;2048 | |
64 | resnetv13_stage2_relu6_fwd | Activation | [1,128,28,28] | 34.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 200704 | 0.00 | 138048.00 | 47.90 | 1.45 | 50.18 | true | 0.478012;0.480113;0.477137;0.480403;0.477774 | 200704;200704;200704;200704;200704 | 0;0;0;0;2304 | 137280;138496;139296;138368;132640 | |
65 | resnetv13_stage2_conv11_fwd | Convolution | [1,128,28,28] | 16601.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.00 | 140836864 | 0.00 | 394965.33 | 12.50 | 356.58 | 4023.91 | false | 0.124816;0.124811;0.124817;0.124816;0.124817 | 140836864;140836864;140836864;140836864;140836864 | 0;0;0;0;0 | 398080;399360;397696;383776;389120 | |
65 | resnetv13_stage2_conv11_fwd | Convolution | [1,128,28,28] | 16601.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 950272 | 589824.00 | 650826.67 | 9.80 | 0.77 | 190.05 | true | 0.097654;0.097848;0.097458;0.098035;0.097654 | 950272;950272;950272;950272;950272 | 589824;589824;589824;589824;589824 | 650592;649344;650880;651008;655104 | |
66 | resnetv13_stage2_batchnorm11_fwd | BatchNorm | [1,128,28,28] | 87 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 634880 | 2048.00 | 256.00 | 17.80 | 275.56 | 126.98 | false | 0.177955;0.177745;0.177789;0.177994;0.177728 | 634880;634880;634880;634880;634880 | 2048;2048;2048;2048;2048 | 256;256;256;256;256 | |
67 | resnetv13_stage2_relu7_fwd | Activation | [1,128,28,28] | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 200704 | 0.00 | 256.00 | 47.90 | 784.00 | 43.00 | false | 0.478152;0.478960;0.478761;0.477809;0.480633 | 200704;200704;200704;200704;200704 | 0;0;0;0;0 | 256;256;384;256;256 | |
68 | resnetv13_stage2_conv12_fwd | Convolution | [1,128,28,28] | 6932 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 24.67 | 105259008 | 279317.33 | 1112362.67 | 11.90 | 75.63 | 4267.20 | false | 0.119943;0.121472;0.118560;0.117493;0.117832 | 105259008;105259008;105259008;105259008;105259008 | 276992;279872;274368;281088;287680 | 1101408;1106784;1135616;1105312;1124992 | |
68 | resnetv13_stage2_conv12_fwd | Convolution | [1,128,28,28] | 6932 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 2007040 | 2048.00 | 111306.67 | 69.80 | 17.71 | 286.72 | false | 0.693355;0.698661;0.694689;0.702187;0.699265 | 2007040;2007040;2007040;2007040;2007040 | 2048;2048;2048;2048;2048 | 119904;118112;82976;121312;95904 | |
69 | resnetv13_stage2_batchnorm12_fwd | BatchNorm | [1,512,28,28] | 101 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 2539520 | 8192.00 | 1153056.00 | 56.90 | 2.19 | 423.25 | true | 0.567294;0.569639;0.569901;0.566262;0.569443 | 2539520;2539520;2539520;2539520;2539520 | 8192;8192;13312;8192;8192 | 1154208;1150176;1153696;1154400;1151264 | |
70 | add_resnetv13_stage2_activation3 | add_relu | [1,512,28,28] | 152.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 6.00 | 401408 | 1178325.33 | 149216.00 | 57.00 | 0.30 | 66.90 | true | 0.557509;0.580829;0.563938;0.574100;0.571549 | 401408;401408;401408;401408;401408 | 1183616;1176320;1170816;1181184;1177472 | 152480;151776;149056;143264;146816 | |
71 | resnetv13_stage3_conv0_fwd | Convolution | [1,512,28,28] | 3696 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.00 | 58770432 | 524288.00 | 81866.67 | 3.10 | 96.96 | 1632.51 | false | 0.031246;0.031247;0.031246;0.031246;0.031246 | 58770432;58770432;58770432;58770432;58770432 | 83488;85792;81568;79648;80544 | 524288;524288;524544;524288;524288 | |
71 | resnetv13_stage3_conv0_fwd | Convolution | [1,512,28,28] | 3696 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 250880 | 1024.00 | 704.00 | 36.80 | 145.19 | 57.90 | false | 0.369942;0.368788;0.367203;0.366330;0.368836 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 672;768;672;640;800 | |
72 | resnetv13_stage3_batchnorm0_fwd | BatchNorm | [1,256,14,14] | 26.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 176277.33 | 8.70 | 1.76 | 63.49 | true | 0.086598;0.087954;0.086848;0.086856;0.085555 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 176320;176576;176192;176320;175968 | |
73 | resnetv13_stage3_relu0_fwd | Activation | [1,256,14,14] | 20.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 24576.00 | 35.10 | 4.08 | 25.09 | true | 0.351438;0.350135;0.350905;0.352588;0.351696 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 24448;24448;24704;24576;24832 | |
74 | resnetv13_stage3_conv1_fwd | Convolution | [1,256,14,14] | 16042.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.33 | 140148736 | 60042.67 | 1276746.67 | 12.50 | 104.84 | 2178.49 | false | 0.124918;0.124919;0.124919;0.124919;0.124913 | 140148736;140148736;140148736;140148736;140148736 | 59008;60704;60416;56160;64000 | 1234368;1270976;1297792;1287328;1271936 | |
74 | resnetv13_stage3_conv1_fwd | Convolution | [1,256,14,14] | 16042.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 3801088 | 2359317.33 | 3108554.67 | 31.10 | 0.70 | 345.55 | true | 0.315651;0.310121;0.309428;0.310640;0.312816 | 3801088;3801088;3801088;3801088;3801088 | 2359296;2362624;2359296;2359360;2359296 | 3106080;3116512;3092544;3103072;3120928 | |
75 | resnetv13_stage3_batchnorm1_fwd | BatchNorm | [1,256,14,14] | 82 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4170.67 | 180757.33 | 8.70 | 1.72 | 63.49 | true | 0.087623;0.086684;0.087155;0.088481;0.086227 | 317440;317440;317440;317440;317440 | 4096;4256;4160;4256;4096 | 180768;180608;180896;181312;178208 | |
76 | resnetv13_stage3_relu1_fwd | Activation | [1,256,14,14] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 160.00 | 25685.33 | 37.80 | 3.88 | 25.09 | true | 0.378609;0.369005;0.377793;0.378450;0.382389 | 100352;100352;100352;100352;100352 | 128;224;128;224;128 | 25472;26368;25728;25408;25856 | |
77 | resnetv13_stage3_conv2_fwd | Convolution | [1,256,14,14] | 6844.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048618.67 | 539434.67 | 7.70 | 74.08 | 3921.37 | false | 0.077809;0.077087;0.076970;0.077033;0.077654 | 117641216;117641216;117641216;117641216;117641216 | 1048576;1048640;1048640;1048576;1048640 | 544992;535808;536896;541568;539840 | |
77 | resnetv13_stage3_conv2_fwd | Convolution | [1,256,14,14] | 6844.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 1003520 | 4096.00 | 63914.67 | 66.10 | 14.76 | 188.17 | true | 0.662363;0.656690;0.665063;0.656266;0.667405 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 58496;67808;65920;61952;63872 | |
78 | resnetv13_stage3_batchnorm2_fwd | BatchNorm | [1,1024,14,14] | 73 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 1269760 | 16384.00 | 555050.67 | 23.80 | 2.22 | 200.50 | true | 0.232534;0.236357;0.238863;0.239400;0.237478 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 554816;555264;554176;556352;555072 | |
79 | resnetv13_stage3_conv3_fwd | Convolution | [1,512,28,28] | 13330.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.67 | 235081728 | 3244032.00 | 1185344.00 | 8.50 | 53.07 | 4380.38 | false | 0.085290;0.085319;0.084854;0.085084;0.085130 | 235081728;235081728;235081728;235081728;235081728 | 3244032;3244032;3244032;3254528;3244032 | 1186048;1182976;1190464;1180864;1187008 | |
80 | resnetv13_stage3_batchnorm3_fwd | BatchNorm | [1,1024,14,14] | 126 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 1269760 | 16384.00 | 4330.67 | 24.00 | 61.30 | 211.63 | false | 0.237741;0.239881;0.239562;0.239423;0.242651 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 9856;1152;2624;9024;1344 | |
81 | add_resnetv13_stage3_activation0 | add_relu | [1,1024,14,14] | 85 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 0.00 | 1642.67 | 47.00 | 122.18 | 40.14 | false | 0.470519;0.471548;0.468560;0.468694;0.469469 | 200704;200704;200704;200704;200704 | 896;0;0;0;0 | 2720;160;1312;1536;2080 | |
82 | resnetv13_stage3_conv4_fwd | Convolution | [1,1024,14,14] | 6947.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 117490688 | 1048576.00 | 9194.67 | 3.10 | 111.07 | 1762.35 | false | 0.031248;0.031247;0.031247;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1048576 | 11968;8896;7616;9280;9408 | |
82 | resnetv13_stage3_conv4_fwd | Convolution | [1,1024,14,14] | 6947.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 250880 | 1248.00 | 170.67 | 36.70 | 176.84 | 53.76 | false | 0.362026;0.367507;0.367423;0.367635;0.366281 | 250880;250880;250880;250880;250880 | 3552;1248;1248;1248;1248 | 256;0;256;0;256 | |
83 | resnetv13_stage3_batchnorm4_fwd | BatchNorm | [1,256,14,14] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 170.67 | 8.80 | 74.40 | 63.49 | false | 0.087548;0.087564;0.087552;0.087487;0.087697 | 317440;317440;317440;317440;317440 | 256;0;256;0;256 | 4096;4096;4096;4096;4096 | |
84 | resnetv13_stage3_relu2_fwd | Activation | [1,256,14,14] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 170.67 | 35.20 | 588.00 | 25.09 | false | 0.352190;0.349690;0.351188;0.352451;0.351589 | 100352;100352;100352;100352;100352 | 0;0;0;0;4352 | 256;0;256;0;256 | |
85 | resnetv13_stage3_conv5_fwd | Convolution | [1,256,14,14] | 15689 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.67 | 140148736 | 61941.33 | 1142624.00 | 12.50 | 116.35 | 2167.24 | false | 0.124921;0.124917;0.124917;0.124918;0.124918 | 140148736;140148736;140148736;140148736;140148736 | 68864;73824;44928;56544;60416 | 1130080;1125248;1215776;1152960;1144832 | |
85 | resnetv13_stage3_conv5_fwd | Convolution | [1,256,14,14] | 15689 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 3801088 | 2359360.00 | 2906432.00 | 30.00 | 0.72 | 380.11 | true | 0.300720;0.301315;0.299370;0.297624;0.301971 | 3801088;3801088;3801088;3801088;3801088 | 2917664;2922784;2830848;2897984;2903648 | 2359360;2359360;2359360;2359424;2359360 | |
86 | resnetv13_stage3_batchnorm5_fwd | BatchNorm | [1,256,14,14] | 83 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4117.33 | 186954.67 | 8.80 | 1.66 | 63.49 | true | 0.089021;0.087832;0.087831;0.088361;0.088273 | 317440;317440;317440;317440;317440 | 4096;4160;4096;4256;4096 | 186400;188608;187968;184800;186496 | |
87 | resnetv13_stage3_relu3_fwd | Activation | [1,256,14,14] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 160.00 | 21973.33 | 38.00 | 4.53 | 25.09 | true | 0.408322;0.369273;0.380760;0.377156;0.381267 | 100352;100352;100352;100352;100352 | 128;224;128;224;128 | 22656;22272;21888;21376;21760 | |
88 | resnetv13_stage3_conv6_fwd | Convolution | [1,256,14,14] | 6705.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 29.67 | 117641216 | 1048597.33 | 540533.33 | 7.70 | 74.03 | 3965.39 | false | 0.077087;0.077456;0.076934;0.076623;0.077569 | 117641216;117641216;117641216;117641216;117641216 | 1048576;1048640;1048576;1048640;1048576 | 544512;543776;537344;531488;540480 | |
88 | resnetv13_stage3_conv6_fwd | Convolution | [1,256,14,14] | 6705.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 1003520 | 4096.00 | 62528.00 | 67.10 | 15.06 | 200.70 | true | 0.671748;0.670117;0.663750;0.671627;0.674255 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 58240;59328;65408;72384;62848 | |
89 | resnetv13_stage3_batchnorm6_fwd | BatchNorm | [1,1024,14,14] | 60.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.67 | 1269760 | 16384.00 | 552042.67 | 23.30 | 2.23 | 190.45 | true | 0.233093;0.238286;0.235099;0.231111;0.225602 | 1269760;1269760;1269760;1269760;1269760 | 553024;550656;550976;554368;552128 | 16384;16384;16384;16384;16384 | |
90 | add_resnetv13_stage3_activation1 | add_relu | [1,1024,14,14] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 49781.33 | 56.70 | 0.24 | 40.14 | true | 0.560473;0.571170;0.563352;0.576598;0.567258 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 49440;52000;49312;47904;50592 | |
91 | resnetv13_stage3_conv7_fwd | Convolution | [1,1024,14,14] | 6587.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.00 | 117490688 | 1048576.00 | 24981.33 | 3.10 | 109.44 | 1780.16 | false | 0.031248;0.031247;0.031248;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1049344;1048576;1048576 | 24384;23232;24512;27840;26048 | |
91 | resnetv13_stage3_conv7_fwd | Convolution | [1,1024,14,14] | 6587.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 250880 | 1024.00 | 1024.00 | 36.60 | 122.50 | 62.72 | false | 0.363714;0.368033;0.366629;0.361633;0.367814 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 1024;1024;1664;1024;1024 | |
92 | resnetv13_stage3_batchnorm7_fwd | BatchNorm | [1,256,14,14] | 39.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 175669.33 | 8.60 | 1.77 | 63.49 | true | 0.087230;0.086659;0.086236;0.085223;0.086184 | 317440;317440;317440;317440;317440 | 5888;4096;4096;4096;4096 | 176032;175872;177152;174336;175104 | |
93 | resnetv13_stage3_relu4_fwd | Activation | [1,256,14,14] | 22.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 24576.00 | 35.20 | 4.08 | 25.09 | true | 0.351896;0.352911;0.350458;0.351950;0.352293 | 100352;100352;100352;100352;100352 | 24448;24704;24704;24576;24448 | 0;0;0;0;0 | |
94 | resnetv13_stage3_conv8_fwd | Convolution | [1,256,14,14] | 14945.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.00 | 140148736 | 63637.33 | 1058208.00 | 12.50 | 124.93 | 2189.82 | false | 0.124918;0.124921;0.124919;0.124922;0.124915 | 140148736;140148736;140148736;140148736;140148736 | 63104;67008;60672;71264;60800 | 1027008;1031200;1113696;1074912;1068512 | |
94 | resnetv13_stage3_conv8_fwd | Convolution | [1,256,14,14] | 14945.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 3801088 | 2359424.00 | 3284149.33 | 31.90 | 0.67 | 367.86 | true | 0.315832;0.317107;0.320093;0.319968;0.320728 | 3801088;3801088;3801088;3801088;3801088 | 2359552;2361216;2359360;2359360;2359360 | 3285088;3319136;3242944;3279840;3287520 | |
95 | resnetv13_stage3_batchnorm8_fwd | BatchNorm | [1,256,14,14] | 82.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4138.67 | 185013.33 | 8.90 | 1.68 | 63.49 | true | 0.089438;0.086096;0.087685;0.090220;0.088570 | 317440;317440;317440;317440;317440 | 185504;184288;184128;187008;185248 | 4096;4256;4096;4160;4160 | |
96 | resnetv13_stage3_relu5_fwd | Activation | [1,256,14,14] | 23.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 224.00 | 25664.00 | 37.70 | 3.88 | 25.09 | true | 0.385653;0.372972;0.371595;0.387735;0.372094 | 100352;100352;100352;100352;100352 | 224;224;128;224;224 | 25728;25664;25600;25344;25728 | |
97 | resnetv13_stage3_conv9_fwd | Convolution | [1,256,14,14] | 6842 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048576.00 | 543893.33 | 7.70 | 73.87 | 3921.37 | false | 0.076630;0.077112;0.076406;0.076956;0.077257 | 117641216;117641216;117641216;117641216;117641216 | 1048576;1048576;1048576;1048640;1048576 | 548352;543680;541312;539680;546688 | |
97 | resnetv13_stage3_conv9_fwd | Convolution | [1,256,14,14] | 6842 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 1003520 | 4096.00 | 59520.00 | 66.50 | 15.77 | 177.08 | true | 0.665811;0.675836;0.662814;0.665746;0.662149 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 54272;59776;62080;63840;56704 | |
98 | resnetv13_stage3_batchnorm9_fwd | BatchNorm | [1,1024,14,14] | 59.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1269760 | 16384.00 | 556266.67 | 23.40 | 2.22 | 181.39 | true | 0.233210;0.234602;0.235013;0.239997;0.232963 | 1269760;1269760;1269760;1269760;1269760 | 20736;16384;16384;16384;16384 | 556864;555712;556224;555648;557728 | |
99 | add_resnetv13_stage3_activation2 | add_relu | [1,1024,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 45482.67 | 56.90 | 0.24 | 40.14 | true | 0.574799;0.568118;0.580889;0.564995;0.558019 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 45696;46848;45952;43904;44800 | |
100 | resnetv13_stage3_conv10_fwd | Convolution | [1,1024,14,14] | 5944 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 117490688 | 1048576.00 | 44565.33 | 3.10 | 107.48 | 1762.35 | false | 0.031248;0.031247;0.031248;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1053696;1048576;1048576;1048576;1048576 | 46656;43968;28096;44992;44736 | |
100 | resnetv13_stage3_conv10_fwd | Convolution | [1,1024,14,14] | 5944 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 250880 | 1024.00 | 1152.00 | 36.90 | 115.29 | 62.72 | false | 0.369287;0.376541;0.369016;0.369719;0.368670 | 250880;250880;250880;250880;250880 | 10752;1024;1152;1152;1152 | 1024;1024;1024;1024;1024 | |
101 | resnetv13_stage3_batchnorm10_fwd | BatchNorm | [1,256,14,14] | 25.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 170517.33 | 8.70 | 1.82 | 63.49 | true | 0.087014;0.086731;0.087269;0.087545;0.087306 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 162848;167744;175936;170464;173344 | |
102 | resnetv13_stage3_relu6_fwd | Activation | [1,256,14,14] | 20.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 100352 | 0.00 | 19850.67 | 35.20 | 5.06 | 23.16 | true | 0.352669;0.351900;0.351361;0.352189;0.351369 | 100352;100352;100352;100352;100352 | 19328;19744;20096;19712;22144 | 0;256;0;0;0 | |
103 | resnetv13_stage3_conv11_fwd | Convolution | [1,256,14,14] | 13493 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.00 | 140148736 | 59200.00 | 1095584.00 | 12.50 | 121.36 | 2189.82 | false | 0.124917;0.124920;0.124918;0.124920;0.124920 | 140148736;140148736;140148736;140148736;140148736 | 68288;68704;54400;53344;54912 | 1093760;1074976;1129920;1086304;1106688 | |
103 | resnetv13_stage3_conv11_fwd | Convolution | [1,256,14,14] | 13493 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 3801088 | 2359381.33 | 3238560.00 | 31.60 | 0.68 | 356.34 | true | 0.314491;0.316432;0.316016;0.316529;0.316157 | 3801088;3801088;3801088;3801088;3801088 | 3240608;3260320;3217184;3247936;3227136 | 2359296;2359360;2359488;2359360;2359424 | |
104 | resnetv13_stage3_batchnorm11_fwd | BatchNorm | [1,256,14,14] | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4128.00 | 190794.67 | 8.90 | 1.63 | 63.49 | true | 0.089120;0.089084;0.088186;0.089124;0.087712 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4192;4192 | 191264;190688;190432;191968;190048 | |
105 | resnetv13_stage3_relu7_fwd | Activation | [1,256,14,14] | 22.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 192.00 | 22144.00 | 38.70 | 4.49 | 25.09 | true | 0.381695;0.369004;0.393336;0.392208;0.385877 | 100352;100352;100352;100352;100352 | 128;224;224;224;128 | 22272;22656;22784;21120;21504 | |
106 | resnetv13_stage3_conv12_fwd | Convolution | [1,256,14,14] | 5834.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048576.00 | 537450.67 | 7.70 | 74.17 | 3921.37 | false | 0.077467;0.077506;0.077629;0.076921;0.077292 | 117641216;117641216;117641216;117641216;117641216 | 1048576;1048576;1048576;1048640;1048576 | 531424;537824;535616;540704;538912 | |
106 | resnetv13_stage3_conv12_fwd | Convolution | [1,256,14,14] | 5834.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 1003520 | 4096.00 | 65461.33 | 66.70 | 14.43 | 200.70 | true | 0.666097;0.660933;0.668970;0.670896;0.664966 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 71168;65184;67328;63296;63872 | |
107 | resnetv13_stage3_batchnorm12_fwd | BatchNorm | [1,1024,14,14] | 61 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.67 | 1269760 | 16384.00 | 553493.33 | 23.60 | 2.23 | 190.45 | true | 0.230356;0.240050;0.229595;0.236952;0.239624 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16640;16384 | 555072;554176;552512;549760;553792 | |
108 | add_resnetv13_stage3_activation3 | add_relu | [1,1024,14,14] | 79 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 49184.00 | 57.00 | 0.24 | 40.14 | true | 0.564462;0.559856;0.569272;0.580883;0.575134 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 48032;49056;49696;52768;48800 | |
109 | resnetv13_stage3_conv13_fwd | Convolution | [1,1024,14,14] | 5912.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.33 | 117490688 | 1048576.00 | 44693.33 | 3.10 | 107.47 | 1771.23 | false | 0.031248;0.031248;0.031247;0.031247;0.031247 | 117490688;117490688;117490688;117490688;117490688 | 42176;49984;50880;37568;41920 | 1048576;1048576;1048576;1048576;1048576 | |
109 | resnetv13_stage3_conv13_fwd | Convolution | [1,1024,14,14] | 5912.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 250880 | 1024.00 | 1109.33 | 36.80 | 117.60 | 62.72 | false | 0.360445;0.368377;0.369299;0.370649;0.367572 | 250880;250880;250880;250880;250880 | 1152;1024;1408;1152;1024 | 1024;1024;1024;1024;1024 | |
110 | resnetv13_stage3_batchnorm13_fwd | BatchNorm | [1,256,14,14] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 317440 | 4096.00 | 177568.00 | 8.70 | 1.75 | 68.02 | true | 0.087018;0.087377;0.086687;0.087312;0.085320 | 317440;317440;317440;317440;317440 | 4096;4096;14592;4096;4096 | 175712;178048;199648;176768;177888 | |
111 | resnetv13_stage3_relu8_fwd | Activation | [1,256,14,14] | 20.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 23808.00 | 35.20 | 4.22 | 25.09 | true | 0.351877;0.353116;0.350048;0.352422;0.351865 | 100352;100352;100352;100352;100352 | 23680;23936;23680;23808;24704 | 0;0;0;0;0 | |
112 | resnetv13_stage3_conv14_fwd | Convolution | [1,256,14,14] | 13457.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.67 | 140148736 | 71978.67 | 1057824.00 | 12.50 | 124.05 | 2167.24 | false | 0.124919;0.124917;0.124920;0.124918;0.124913 | 140148736;140148736;140148736;140148736;140148736 | 74112;71328;74240;62688;70496 | 1001504;1068576;1062752;1093920;1042144 | |
112 | resnetv13_stage3_conv14_fwd | Convolution | [1,256,14,14] | 13457.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 3801088 | 2359360.00 | 3268778.67 | 31.70 | 0.68 | 380.11 | true | 0.314786;0.315738;0.317064;0.318586;0.318481 | 3801088;3801088;3801088;3801088;3801088 | 3338592;3262208;3243776;3250336;3293792 | 2364544;2359360;2359360;2359296;2359360 | |
113 | resnetv13_stage3_batchnorm14_fwd | BatchNorm | [1,256,14,14] | 72.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4224.00 | 184448.00 | 8.80 | 1.68 | 63.49 | true | 0.090159;0.089332;0.087678;0.087836;0.087738 | 317440;317440;317440;317440;317440 | 186976;184384;184128;183168;184832 | 4096;4256;4256;4256;4160 | |
114 | resnetv13_stage3_relu9_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 100352 | 192.00 | 25792.00 | 37.50 | 3.86 | 20.07 | true | 0.374099;0.372937;0.377163;0.374998;0.376548 | 100352;100352;100352;100352;100352 | 128;224;128;224;224 | 25664;25920;25728;25728;25984 | |
115 | resnetv13_stage3_conv15_fwd | Convolution | [1,256,14,14] | 5791.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048576.00 | 540917.33 | 7.70 | 74.01 | 3921.37 | false | 0.077824;0.076963;0.076631;0.077287;0.076765 | 117641216;117641216;117641216;117641216;117641216 | 1048576;1048576;1048576;1048704;1048576 | 538528;539200;541600;548640;541952 | |
115 | resnetv13_stage3_conv15_fwd | Convolution | [1,256,14,14] | 5791.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 1003520 | 4096.00 | 62336.00 | 66.10 | 15.11 | 177.08 | true | 0.645605;0.669000;0.659448;0.653821;0.668714 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 64256;64128;61440;54976;61440 | |
116 | resnetv13_stage3_batchnorm15_fwd | BatchNorm | [1,1024,14,14] | 56.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 1269760 | 16384.00 | 555509.33 | 23.50 | 2.22 | 200.50 | true | 0.231117;0.236639;0.237737;0.228758;0.239485 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 553696;555200;555296;556032;557248 | |
117 | add_resnetv13_stage3_activation4 | add_relu | [1,1024,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 46848.00 | 57.20 | 0.24 | 40.14 | true | 0.571170;0.571223;0.572484;0.569160;0.582290 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 48768;47360;46848;46336;46080 | |
118 | resnetv13_stage3_conv16_fwd | Convolution | [1,1024,14,14] | 5888 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 117490688 | 1048576.00 | 32064.00 | 3.10 | 108.72 | 1762.35 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1049856;1048576;1048576 | 23616;30016;42560;18048;57408 | |
118 | resnetv13_stage3_conv16_fwd | Convolution | [1,1024,14,14] | 5888 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 250880 | 1024.00 | 810.67 | 36.70 | 136.74 | 57.90 | false | 0.364770;0.368180;0.367663;0.366017;0.369242 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 384;0;1920;17920;128 | |
119 | resnetv13_stage3_batchnorm16_fwd | BatchNorm | [1,256,14,14] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 168138.67 | 8.70 | 1.84 | 63.49 | true | 0.087355;0.085333;0.087273;0.087449;0.086982 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 168992;172416;162464;173024;163008 | |
120 | resnetv13_stage3_relu10_fwd | Activation | [1,256,14,14] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 20405.33 | 35.10 | 4.92 | 25.09 | true | 0.351724;0.350445;0.350476;0.353287;0.350661 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 21120;20640;19328;20096;20480 | |
121 | resnetv13_stage3_conv17_fwd | Convolution | [1,256,14,14] | 13466.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.67 | 140148736 | 68373.33 | 1075445.33 | 12.50 | 122.53 | 2167.24 | false | 0.124919;0.124916;0.124914;0.124916;0.124920 | 140148736;140148736;140148736;140148736;140148736 | 1006784;1071200;1113056;1099232;1055904 | 92160;57696;57824;55264;89600 | |
121 | resnetv13_stage3_conv17_fwd | Convolution | [1,256,14,14] | 13466.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 3801088 | 2359381.33 | 3256522.67 | 31.50 | 0.68 | 367.86 | true | 0.315622;0.312840;0.312753;0.316949;0.316581 | 3801088;3801088;3801088;3801088;3801088 | 2359360;2359424;2359360;2359296;2359424 | 3354912;3252224;3235904;3247360;3269984 | |
122 | resnetv13_stage3_batchnorm17_fwd | BatchNorm | [1,256,14,14] | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4202.67 | 192373.33 | 8.90 | 1.61 | 63.49 | true | 0.089199;0.089640;0.088639;0.088993;0.087242 | 317440;317440;317440;317440;317440 | 4096;4256;4096;4256;4256 | 193184;191680;190144;193376;192256 | |
123 | resnetv13_stage3_relu11_fwd | Activation | [1,256,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 224.00 | 22016.00 | 37.70 | 4.51 | 25.09 | true | 0.385919;0.373069;0.378504;0.377361;0.375830 | 100352;100352;100352;100352;100352 | 22144;22272;22144;19840;21760 | 224;352;224;224;128 | |
124 | resnetv13_stage3_conv18_fwd | Convolution | [1,256,14,14] | 5752.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048618.67 | 539466.67 | 7.70 | 74.08 | 3921.37 | false | 0.077347;0.076910;0.076869;0.077686;0.077561 | 117641216;117641216;117641216;117641216;117641216 | 1048576;1049024;1048576;1048704;1048576 | 543744;536256;533632;548544;538400 | |
124 | resnetv13_stage3_conv18_fwd | Convolution | [1,256,14,14] | 5752.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 1003520 | 4096.00 | 63722.67 | 66.70 | 14.80 | 200.70 | true | 0.660778;0.675953;0.664046;0.672180;0.663354 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 59008;66880;68992;55232;65280 | |
125 | resnetv13_stage3_batchnorm18_fwd | BatchNorm | [1,1024,14,14] | 63.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 1269760 | 16384.00 | 553344.00 | 24.30 | 2.23 | 211.63 | true | 0.246914;0.240759;0.244021;0.244397;0.239788 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 554048;551936;554048;556160;551616 | |
126 | add_resnetv13_stage3_activation5 | add_relu | [1,1024,14,14] | 89.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 49056.00 | 57.30 | 0.24 | 40.14 | true | 0.578199;0.572000;0.571576;0.563749;0.574754 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 48416;50464;48288;45472;50976 | |
127 | resnetv13_stage3_conv19_fwd | Convolution | [1,1024,14,14] | 5883.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 117490688 | 1048661.33 | 25578.67 | 3.10 | 109.37 | 1762.35 | false | 0.031247;0.031247;0.031248;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048832;1048832 | 28736;17728;30272;14080;57664 | |
127 | resnetv13_stage3_conv19_fwd | Convolution | [1,1024,14,14] | 5883.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 250880 | 1024.00 | 1280.00 | 36.50 | 108.89 | 50.18 | false | 0.369638;0.367871;0.358845;0.363405;0.364951 | 250880;250880;250880;250880;250880 | 1024;128;1024;1792;8832 | 1024;1024;5376;1024;1024 | |
128 | resnetv13_stage3_batchnorm19_fwd | BatchNorm | [1,256,14,14] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 168949.33 | 8.70 | 1.83 | 63.49 | true | 0.086567;0.086541;0.086534;0.087825;0.085149 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 168672;169984;176096;168192;164832 | |
129 | resnetv13_stage3_relu12_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 23424.00 | 35.20 | 4.28 | 25.09 | true | 0.352239;0.351424;0.351571;0.352326;0.351919 | 100352;100352;100352;100352;100352 | 0;0;1792;0;0 | 23680;23040;23936;23424;23168 | |
130 | resnetv13_stage3_conv20_fwd | Convolution | [1,256,14,14] | 13439 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 65.00 | 140148736 | 66506.67 | 1062794.67 | 12.50 | 124.10 | 2156.13 | false | 0.124921;0.124920;0.124919;0.124918;0.124912 | 140148736;140148736;140148736;140148736;140148736 | 80512;59424;65792;71136;62592 | 1044032;1082432;998080;1100960;1061920 | |
130 | resnetv13_stage3_conv20_fwd | Convolution | [1,256,14,14] | 13439 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 3801088 | 2359381.33 | 3292618.67 | 31.60 | 0.67 | 367.86 | true | 0.309896;0.316240;0.317845;0.315798;0.316388 | 3801088;3801088;3801088;3801088;3801088 | 3316992;3283904;3333440;3276960;3266784 | 2359360;2359360;2362432;2359296;2359424 | |
131 | resnetv13_stage3_batchnorm20_fwd | BatchNorm | [1,256,14,14] | 74.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4170.67 | 184064.00 | 8.80 | 1.69 | 63.49 | true | 0.089665;0.088425;0.086610;0.088417;0.087726 | 317440;317440;317440;317440;317440 | 4096;4256;4096;6048;4160 | 183328;183936;185152;182272;184928 | |
132 | resnetv13_stage3_relu13_fwd | Activation | [1,256,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 192.00 | 25600.00 | 37.50 | 3.89 | 25.09 | true | 0.379241;0.370138;0.373135;0.386370;0.371421 | 100352;100352;100352;100352;100352 | 128;224;224;224;128 | 25472;25472;25856;25472;25920 | |
133 | resnetv13_stage3_conv21_fwd | Convolution | [1,256,14,14] | 5723.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048597.33 | 541322.67 | 7.70 | 73.99 | 3921.37 | false | 0.076921;0.077860;0.077463;0.077746;0.077157 | 117641216;117641216;117641216;117641216;117641216 | 1048576;1048640;1048576;1048704;1048576 | 537984;545664;546144;538752;539552 | |
133 | resnetv13_stage3_conv21_fwd | Convolution | [1,256,14,14] | 5723.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 1003520 | 4096.00 | 61162.67 | 66.90 | 15.38 | 188.17 | true | 0.672837;0.665097;0.670154;0.658586;0.670857 | 1003520;1003520;1003520;1003520;1003520 | 7168;4096;4096;4096;4096 | 62080;57664;56960;64832;63744 | |
134 | resnetv13_stage3_batchnorm21_fwd | BatchNorm | [1,1024,14,14] | 55.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1269760 | 16384.00 | 557002.67 | 23.70 | 2.21 | 181.39 | true | 0.240010;0.240206;0.234849;0.233872;0.236961 | 1269760;1269760;1269760;1269760;1269760 | 556096;558336;557728;556416;556864 | 16384;16384;16384;16384;16384 | |
135 | add_resnetv13_stage3_activation6 | add_relu | [1,1024,14,14] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 45482.67 | 57.30 | 0.24 | 40.14 | true | 0.572875;0.575408;0.572167;0.570752;0.580286 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;803328 | 46336;44544;44672;46080;45696 | |
136 | resnetv13_stage3_conv22_fwd | Convolution | [1,1024,14,14] | 6010.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 117490688 | 1048576.00 | 41728.00 | 3.10 | 107.76 | 1753.59 | false | 0.031248;0.031248;0.031247;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1048576 | 30528;54976;53952;40704;26688 | |
136 | resnetv13_stage3_conv22_fwd | Convolution | [1,1024,14,14] | 6010.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 250880 | 1024.00 | 85.33 | 36.60 | 226.15 | 62.72 | false | 0.364255;0.365887;0.357778;0.367642;0.366526 | 250880;250880;250880;250880;250880 | 128;128;128;0;0 | 1024;1024;1024;1024;1024 | |
137 | resnetv13_stage3_batchnorm22_fwd | BatchNorm | [1,256,14,14] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 166709.33 | 8.70 | 1.86 | 63.49 | true | 0.086625;0.085596;0.086292;0.087267;0.088078 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 170016;164096;163872;166240;169792 | |
138 | resnetv13_stage3_relu14_fwd | Activation | [1,256,14,14] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 20234.67 | 35.10 | 4.96 | 25.09 | true | 0.351190;0.351227;0.349879;0.352131;0.351453 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 20864;20512;19328;19328;21120 | |
139 | resnetv13_stage3_conv23_fwd | Convolution | [1,256,14,14] | 13517.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.67 | 140148736 | 71904.00 | 1082922.67 | 12.50 | 121.36 | 2167.24 | false | 0.124922;0.124918;0.124921;0.124918;0.124920 | 140148736;140148736;140148736;140148736;140148736 | 68224;59584;73472;74016;96000 | 1111968;1089984;1079232;1079552;1034816 | |
139 | resnetv13_stage3_conv23_fwd | Convolution | [1,256,14,14] | 13517.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 3801088 | 2359360.00 | 3254432.00 | 31.50 | 0.68 | 345.55 | true | 0.314713;0.313112;0.315678;0.315785;0.314978 | 3801088;3801088;3801088;3801088;3801088 | 2359360;2359360;2359360;2359296;2359360 | 3246336;3242240;3251968;3264992;3315648 | |
140 | resnetv13_stage3_batchnorm23_fwd | BatchNorm | [1,256,14,14] | 83.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4192.00 | 192138.67 | 8.80 | 1.62 | 63.49 | true | 0.089224;0.087658;0.087712;0.088486;0.089325 | 317440;317440;317440;317440;317440 | 4096;4256;4160;4256;4160 | 192672;190560;192160;192608;191648 | |
141 | resnetv13_stage3_relu15_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 192.00 | 21802.67 | 37.60 | 4.56 | 25.09 | true | 0.386989;0.372802;0.373874;0.379382;0.374428 | 100352;100352;100352;100352;100352 | 224;2272;128;224;128 | 22016;21504;22016;20736;21888 | |
142 | resnetv13_stage3_conv24_fwd | Convolution | [1,256,14,14] | 5933.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048640.00 | 539690.67 | 7.70 | 74.07 | 3921.37 | false | 0.076862;0.076148;0.076786;0.076364;0.076973 | 117641216;117641216;117641216;117641216;117641216 | 1048576;1048640;1048576;1048704;1053696 | 532704;544672;547584;533376;541024 | |
142 | resnetv13_stage3_conv24_fwd | Convolution | [1,256,14,14] | 5933.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 1003520 | 4096.00 | 63648.00 | 66.50 | 14.81 | 188.17 | true | 0.671288;0.659525;0.671925;0.658306;0.665310 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 69760;58592;55808;70496;62592 | |
143 | resnetv13_stage3_batchnorm24_fwd | BatchNorm | [1,1024,14,14] | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.67 | 1269760 | 16384.00 | 553216.00 | 23.70 | 2.23 | 190.45 | true | 0.235028;0.244826;0.236538;0.238026;0.225335 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 551232;555136;553920;551808;553920 | |
144 | add_resnetv13_stage3_activation7 | add_relu | [1,1024,14,14] | 88.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 49269.33 | 56.40 | 0.24 | 40.14 | true | 0.562182;0.560718;0.556380;0.578616;0.569747 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;807936 | 51104;47264;48416;50720;48672 | |
145 | resnetv13_stage3_conv25_fwd | Convolution | [1,1024,14,14] | 5987.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 117490688 | 1048576.00 | 25600.00 | 3.10 | 109.38 | 1753.59 | false | 0.031248;0.031247;0.031248;0.031247;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 14272;22336;42176;25088;29376 | 1048576;1048576;1048576;1048576;1048576 | |
145 | resnetv13_stage3_conv25_fwd | Convolution | [1,1024,14,14] | 5987.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 250880 | 1024.00 | 298.67 | 36.70 | 189.68 | 53.76 | false | 0.365784;0.366134;0.368486;0.366563;0.367552 | 250880;250880;250880;250880;250880 | 0;512;128;256;1152 | 1024;1024;1024;1024;1024 | |
146 | resnetv13_stage3_batchnorm25_fwd | BatchNorm | [1,256,14,14] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4352.00 | 175200.00 | 8.70 | 1.77 | 63.49 | true | 0.086978;0.087097;0.087407;0.088480;0.086225 | 317440;317440;317440;317440;317440 | 175968;172928;174688;191104;174944 | 4096;4096;4096;4864;12544 | |
147 | resnetv13_stage3_relu16_fwd | Activation | [1,256,14,14] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 24448.00 | 35.20 | 4.10 | 25.09 | true | 0.352901;0.351687;0.351638;0.350963;0.351352 | 100352;100352;100352;100352;100352 | 24704;24320;24192;24320;24704 | 0;0;0;0;256 | |
148 | resnetv13_stage3_conv26_fwd | Convolution | [1,256,14,14] | 13547.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.33 | 140148736 | 65514.67 | 1055146.67 | 12.50 | 125.06 | 2178.49 | false | 0.124913;0.124922;0.124918;0.124919;0.124923 | 140148736;140148736;140148736;140148736;140148736 | 66304;88576;64608;65632;59904 | 1050304;959968;1051904;1063232;1073888 | |
148 | resnetv13_stage3_conv26_fwd | Convolution | [1,256,14,14] | 13547.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 3801088 | 2359381.33 | 3293973.33 | 31.50 | 0.67 | 367.86 | true | 0.316946;0.318445;0.311818;0.311501;0.317715 | 3801088;3801088;3801088;3801088;3801088 | 2359424;2359360;2359360;2359296;2359424 | 3315744;3396928;3286528;3279648;3274176 | |
149 | resnetv13_stage3_batchnorm26_fwd | BatchNorm | [1,256,14,14] | 76.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4181.33 | 183264.00 | 8.80 | 1.69 | 63.49 | true | 0.087151;0.087560;0.087376;0.089155;0.087901 | 317440;317440;317440;317440;317440 | 4096;4192;6048;4256;4096 | 183360;187936;182976;183424;183008 | |
150 | resnetv13_stage3_relu17_fwd | Activation | [1,256,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 1077.33 | 25696.00 | 37.50 | 3.75 | 25.09 | true | 0.377930;0.373250;0.372840;0.381051;0.371080 | 100352;100352;100352;100352;100352 | 224;224;5344;2784;128 | 25632;25600;25600;25856;27072 | |
151 | resnetv13_stage3_conv27_fwd | Convolution | [1,256,14,14] | 5772.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048661.33 | 539562.67 | 7.70 | 74.07 | 3921.37 | false | 0.078258;0.076937;0.077273;0.077301;0.076934 | 117641216;117641216;117641216;117641216;117641216 | 1048640;1048704;1048640;1048704;1048640 | 543456;542528;535584;533696;540576 | |
151 | resnetv13_stage3_conv27_fwd | Convolution | [1,256,14,14] | 5772.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 1003520 | 4096.00 | 63861.33 | 67.10 | 14.77 | 177.08 | true | 0.672684;0.664212;0.672935;0.666169;0.683319 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 59264;61024;67584;69984;62976 | |
152 | resnetv13_stage3_batchnorm27_fwd | BatchNorm | [1,1024,14,14] | 59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.67 | 1269760 | 16384.00 | 557930.67 | 23.50 | 2.21 | 190.45 | true | 0.233362;0.234973;0.235872;0.235590;0.235800 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 555744;558208;558240;557568;558016 | |
153 | add_resnetv13_stage3_activation8 | add_relu | [1,1024,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 45952.00 | 57.30 | 0.24 | 40.14 | true | 0.574746;0.573653;0.569313;0.575735;0.568368 | 200704;200704;200704;200704;200704 | 812288;802816;802816;802816;802816 | 52224;45824;44032;44928;47104 | |
154 | resnetv13_stage3_conv28_fwd | Convolution | [1,1024,14,14] | 5879.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 117490688 | 1048576.00 | 44672.00 | 3.10 | 107.47 | 1762.35 | false | 0.031248;0.031248;0.031247;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048832;1048576;1048576 | 41728;37248;63360;20224;55040 | |
154 | resnetv13_stage3_conv28_fwd | Convolution | [1,1024,14,14] | 5879.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 250880 | 1024.00 | 1152.00 | 36.90 | 115.29 | 62.72 | false | 0.368707;0.367249;0.367739;0.369965;0.369110 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 1280;10880;1152;1024;1024 | |
155 | resnetv13_stage3_batchnorm28_fwd | BatchNorm | [1,256,14,14] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 159424.00 | 8.60 | 1.94 | 63.49 | true | 0.088044;0.086959;0.085848;0.085641;0.086215 | 317440;317440;317440;317440;317440 | 4096;4096;4096;9216;4096 | 167328;154752;154912;172768;156032 | |
156 | resnetv13_stage3_relu18_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 18517.33 | 35.20 | 5.42 | 25.09 | true | 0.352130;0.352718;0.350874;0.351923;0.352407 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 19072;17952;18432;19584;18048 | |
157 | resnetv13_stage3_conv29_fwd | Convolution | [1,256,14,14] | 13476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.67 | 140148736 | 73056.00 | 1088704.00 | 12.50 | 120.63 | 2167.24 | false | 0.124923;0.124921;0.124918;0.124920;0.124916 | 140148736;140148736;140148736;140148736;140148736 | 65696;74656;69248;97376;75264 | 1106304;1065824;1113664;1086240;1073568 | |
157 | resnetv13_stage3_conv29_fwd | Convolution | [1,256,14,14] | 13476 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 3801088 | 2359317.33 | 3255306.67 | 31.50 | 0.68 | 356.34 | true | 0.316943;0.314070;0.319136;0.310836;0.312796 | 3801088;3801088;3801088;3801088;3801088 | 2359296;2361344;2359296;2359296;2359360 | 3227424;3280064;3219552;3279456;3259040 | |
158 | resnetv13_stage3_batchnorm29_fwd | BatchNorm | [1,256,14,14] | 78 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 317440 | 4149.33 | 191744.00 | 8.90 | 1.62 | 68.02 | true | 0.089502;0.089381;0.088888;0.088301;0.087427 | 317440;317440;317440;317440;317440 | 4096;4256;4096;4256;4096 | 191776;191328;192128;190944;193184 | |
159 | resnetv13_stage3_relu19_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 192.00 | 22016.00 | 38.10 | 4.52 | 25.09 | true | 0.382658;0.381203;0.375861;0.382177;0.379903 | 100352;100352;100352;100352;100352 | 224;224;128;224;128 | 22528;22528;22016;21248;21504 | |
160 | resnetv13_stage3_conv30_fwd | Convolution | [1,256,14,14] | 5758.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048682.67 | 538496.00 | 7.70 | 74.12 | 3921.37 | false | 0.077284;0.077793;0.077192;0.077180;0.077778 | 117641216;117641216;117641216;117641216;117641216 | 1058624;1048704;1048640;1048704;1048640 | 533984;530752;542336;539168;543200 | |
160 | resnetv13_stage3_conv30_fwd | Convolution | [1,256,14,14] | 5758.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 1003520 | 4096.00 | 64746.67 | 66.50 | 14.58 | 177.08 | true | 0.660721;0.664040;0.661984;0.668668;0.672274 | 1003520;1003520;1003520;1003520;1003520 | 4096;12288;4096;4096;4096 | 68608;72544;60800;64832;60032 | |
161 | resnetv13_stage3_batchnorm30_fwd | BatchNorm | [1,1024,14,14] | 57 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 1269760 | 16384.00 | 553621.33 | 24.00 | 2.23 | 200.50 | true | 0.232395;0.237035;0.243104;0.240138;0.242955 | 1269760;1269760;1269760;1269760;1269760 | 553664;554240;552000;552960;554432 | 16896;16384;16384;16384;16384 | |
162 | add_resnetv13_stage3_activation9 | add_relu | [1,1024,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 48842.67 | 57.20 | 0.24 | 40.14 | true | 0.579683;0.568031;0.574823;0.572903;0.560919 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 48800;48160;49696;49568;46112 | |
163 | resnetv13_stage3_conv31_fwd | Convolution | [1,1024,14,14] | 5883.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.33 | 117490688 | 1048576.00 | 13696.00 | 3.10 | 110.60 | 1771.23 | false | 0.031247;0.031248;0.031248;0.031247;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1048576 | 19456;15232;22528;6400;5760 | |
163 | resnetv13_stage3_conv31_fwd | Convolution | [1,1024,14,14] | 5883.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 250880 | 1024.00 | 128.00 | 36.90 | 217.78 | 62.72 | false | 0.371727;0.368714;0.364919;0.369251;0.368236 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 128;128;256;128;0 | |
164 | resnetv13_stage3_batchnorm31_fwd | BatchNorm | [1,256,14,14] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 175381.33 | 8.70 | 1.77 | 63.49 | true | 0.086328;0.086855;0.086775;0.086605;0.087463 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 175584;175104;174816;178816;175456 | |
165 | resnetv13_stage3_relu20_fwd | Activation | [1,256,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 24832.00 | 35.20 | 4.04 | 25.09 | true | 0.352076;0.352549;0.360554;0.352331;0.349377 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 24960;24832;25216;24576;24704 | |
166 | resnetv13_stage3_conv32_fwd | Convolution | [1,256,14,14] | 13448.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.33 | 140148736 | 72341.33 | 1064970.67 | 12.50 | 123.23 | 2178.49 | false | 0.124919;0.124920;0.124921;0.124918;0.124917 | 140148736;140148736;140148736;140148736;140148736 | 55936;69600;90880;69440;77984 | 1099712;1102752;988192;1062848;1032352 | |
166 | resnetv13_stage3_conv32_fwd | Convolution | [1,256,14,14] | 13448.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 3801088 | 2359317.33 | 3297514.67 | 31.70 | 0.67 | 345.55 | true | 0.316524;0.315999;0.318408;0.317237;0.318795 | 3801088;3801088;3801088;3801088;3801088 | 2359360;2359296;2359296;2359296;2360064 | 3263968;3265152;3370464;3302112;3325280 | |
167 | resnetv13_stage3_batchnorm32_fwd | BatchNorm | [1,256,14,14] | 74 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4256.00 | 185194.67 | 8.80 | 1.68 | 63.49 | true | 0.088994;0.087417;0.088099;0.089220;0.087391 | 317440;317440;317440;317440;317440 | 186016;182912;187072;184352;185216 | 4256;4256;4256;4256;4096 | |
168 | resnetv13_stage3_relu21_fwd | Activation | [1,256,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 224.00 | 25600.00 | 37.40 | 3.89 | 25.09 | true | 0.377404;0.370939;0.369124;0.404847;0.374561 | 100352;100352;100352;100352;100352 | 128;224;224;224;224 | 25600;25344;25728;25472;25728 | |
169 | resnetv13_stage3_conv33_fwd | Convolution | [1,256,14,14] | 5740.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048661.33 | 544341.33 | 7.70 | 73.85 | 3921.37 | false | 0.076737;0.077259;0.076825;0.077900;0.075995 | 117641216;117641216;117641216;117641216;117641216 | 1048640;1048704;1048640;1048704;1048576 | 550560;537408;547360;545312;540352 | |
169 | resnetv13_stage3_conv33_fwd | Convolution | [1,256,14,14] | 5740.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 1003520 | 4096.00 | 58538.67 | 66.20 | 16.02 | 188.17 | true | 0.661300;0.662530;0.659272;0.666008;0.661233 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 52736;66240;54656;58368;62592 | |
170 | resnetv13_stage3_batchnorm33_fwd | BatchNorm | [1,1024,14,14] | 55.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.67 | 1269760 | 16384.00 | 555093.33 | 23.20 | 2.22 | 190.45 | true | 0.229131;0.228953;0.235823;0.233570;0.234375 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16640 | 554720;555616;556960;554240;554944 | |
171 | add_resnetv13_stage3_activation10 | add_relu | [1,1024,14,14] | 88.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 48149.33 | 57.30 | 0.24 | 40.14 | true | 0.570923;0.569824;0.562346;0.578054;0.581154 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 47744;46848;54656;48192;48512 | |
172 | resnetv13_stage3_conv34_fwd | Convolution | [1,1024,14,14] | 5855 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 117490688 | 1048576.00 | 49152.00 | 3.10 | 107.03 | 1753.59 | false | 0.031247;0.031248;0.031248;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1048576 | 32128;38656;53248;55552;57408 | |
172 | resnetv13_stage3_conv34_fwd | Convolution | [1,1024,14,14] | 5855 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 250880 | 1024.00 | 469.33 | 36.70 | 168.00 | 53.76 | false | 0.363645;0.366171;0.369291;0.367613;0.368429 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 128;512;512;1024;384 | |
173 | resnetv13_stage3_batchnorm34_fwd | BatchNorm | [1,256,14,14] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4181.33 | 164842.67 | 8.60 | 1.88 | 63.49 | true | 0.086318;0.086349;0.086613;0.087025;0.085876 | 317440;317440;317440;317440;317440 | 5376;4096;4352;4096;4096 | 168736;169376;160928;163424;162368 | |
174 | resnetv13_stage3_relu22_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 20448.00 | 35.20 | 4.91 | 25.09 | true | 0.352453;0.350956;0.357295;0.351652;0.352410 | 100352;100352;100352;100352;100352 | 288;0;0;0;0 | 21248;21152;19584;19072;20608 | |
175 | resnetv13_stage3_conv35_fwd | Convolution | [1,256,14,14] | 13494.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.00 | 140148736 | 63701.33 | 1098730.67 | 12.50 | 120.57 | 2189.82 | false | 0.124921;0.124918;0.124915;0.124920;0.124919 | 140148736;140148736;140148736;140148736;140148736 | 90336;68960;51552;59744;62400 | 1059008;1108288;1091360;1096544;1110400 | |
175 | resnetv13_stage3_conv35_fwd | Convolution | [1,256,14,14] | 13494.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.67 | 3801088 | 2359317.33 | 3240128.00 | 31.50 | 0.68 | 356.34 | true | 0.315184;0.315879;0.309543;0.314229;0.315512 | 3801088;3801088;3801088;3801088;3801088 | 3281856;3240448;3239680;3240256;3216896 | 2360064;2359296;2359296;2359296;2359360 | |
176 | resnetv13_stage3_batchnorm35_fwd | BatchNorm | [1,256,14,14] | 75.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4202.67 | 191637.33 | 8.90 | 1.62 | 63.49 | true | 0.088955;0.089053;0.087626;0.089039;0.087574 | 317440;317440;317440;317440;317440 | 4256;4256;4096;4256;4096 | 194240;191808;191392;191712;190560 | |
177 | resnetv13_stage3_relu23_fwd | Activation | [1,256,14,14] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 192.00 | 22101.33 | 37.80 | 4.50 | 25.09 | true | 0.426895;0.369247;0.370721;0.387051;0.375301 | 100352;100352;100352;100352;100352 | 128;224;224;224;128 | 22400;22016;22656;21120;21888 | |
178 | resnetv13_stage3_conv36_fwd | Convolution | [1,256,14,14] | 5745 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 29.67 | 117641216 | 1048661.33 | 536938.67 | 7.70 | 74.19 | 3965.39 | false | 0.076932;0.077184;0.076963;0.077269;0.077961 | 117641216;117641216;117641216;117641216;117641216 | 1048704;1048704;1048576;1048960;1048576 | 530112;539840;530112;540864;544288 | |
178 | resnetv13_stage3_conv36_fwd | Convolution | [1,256,14,14] | 5745 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 1003520 | 4096.00 | 66186.67 | 66.50 | 14.28 | 188.17 | true | 0.669775;0.673560;0.663241;0.663133;0.660441 | 1003520;1003520;1003520;1003520;1003520 | 4352;4096;4096;4096;4096 | 73152;62944;72704;62912;58752 | |
179 | resnetv13_stage3_batchnorm36_fwd | BatchNorm | [1,1024,14,14] | 57 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 1269760 | 16384.00 | 553738.67 | 23.20 | 2.23 | 211.63 | true | 0.237889;0.233949;0.229526;0.231843;0.228699 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 554144;553984;552512;553088;554304 | |
180 | add_resnetv13_stage3_activation11 | add_relu | [1,1024,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 48714.67 | 56.40 | 0.24 | 40.14 | true | 0.574281;0.567335;0.559708;0.557538;0.565740 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 46496;48416;49824;49440;48288 | |
181 | resnetv13_stage3_conv37_fwd | Convolution | [1,1024,14,14] | 5895 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 117490688 | 1048576.00 | 26560.00 | 3.10 | 109.28 | 1762.35 | false | 0.031248;0.031248;0.031247;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1051648;1048576;1048576;1048576;1048576 | 32512;21632;41152;21376;25536 | |
181 | resnetv13_stage3_conv37_fwd | Convolution | [1,1024,14,14] | 5895 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 250880 | 1024.00 | 554.67 | 36.90 | 158.92 | 53.76 | false | 0.369439;0.369399;0.368913;0.367412;0.363917 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 12544;384;128;1152;0 | |
182 | resnetv13_stage3_batchnorm37_fwd | BatchNorm | [1,256,14,14] | 25.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 177600.00 | 8.60 | 1.75 | 63.49 | true | 0.085409;0.085860;0.086632;0.085104;0.087155 | 317440;317440;317440;317440;317440 | 177984;177792;175328;177024;188512 | 4096;4096;4096;4096;4096 | |
183 | resnetv13_stage3_relu24_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 24533.33 | 35.20 | 4.09 | 25.09 | true | 0.352362;0.351615;0.351810;0.352610;0.351353 | 100352;100352;100352;100352;100352 | 24448;24448;24064;25088;24704 | 0;0;0;0;0 | |
184 | resnetv13_stage3_conv38_fwd | Convolution | [1,256,14,14] | 13492.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.33 | 140148736 | 62314.67 | 1095946.67 | 12.50 | 121.00 | 2178.49 | false | 0.124918;0.124921;0.124917;0.124912;0.124918 | 140148736;140148736;140148736;140148736;140148736 | 66528;80096;54912;55904;64512 | 1084224;1074656;1128960;1139776;1065120 | |
184 | resnetv13_stage3_conv38_fwd | Convolution | [1,256,14,14] | 13492.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 3801088 | 2359360.00 | 3246645.33 | 31.80 | 0.68 | 380.11 | true | 0.316803;0.318173;0.318792;0.313481;0.318293 | 3801088;3801088;3801088;3801088;3801088 | 2359360;2359296;2359360;2359360;2359424 | 3245856;3280256;3214880;3219264;3274816 | |
185 | resnetv13_stage3_batchnorm38_fwd | BatchNorm | [1,256,14,14] | 77 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4181.33 | 183818.67 | 8.80 | 1.69 | 63.49 | true | 0.089317;0.086556;0.087313;0.088145;0.087753 | 317440;317440;317440;317440;317440 | 4256;4256;4096;4192;4096 | 184768;185216;182464;182400;184224 | |
186 | resnetv13_stage3_relu25_fwd | Activation | [1,256,14,14] | 20.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 192.00 | 25578.67 | 37.80 | 3.89 | 25.09 | true | 0.374663;0.369834;0.380797;0.384017;0.378235 | 100352;100352;100352;100352;100352 | 224;224;128;5344;128 | 25536;25472;25728;25984;25472 | |
187 | resnetv13_stage3_conv39_fwd | Convolution | [1,256,14,14] | 5732 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 29.33 | 117641216 | 1048618.67 | 544597.33 | 7.70 | 73.84 | 4010.54 | false | 0.076542;0.077879;0.077547;0.077234;0.077392 | 117641216;117641216;117641216;117641216;117641216 | 1048640;1048640;1048576;1048704;1048576 | 537280;531744;548192;548640;548320 | |
187 | resnetv13_stage3_conv39_fwd | Convolution | [1,256,14,14] | 5732 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 1003520 | 4096.00 | 58762.67 | 67.10 | 15.96 | 200.70 | true | 0.668319;0.667855;0.679416;0.675099;0.669191 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 65920;71584;55168;55200;55168 | |
188 | resnetv13_stage3_batchnorm39_fwd | BatchNorm | [1,1024,14,14] | 59.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 1269760 | 16384.00 | 557056.00 | 23.60 | 2.21 | 200.50 | true | 0.240704;0.232958;0.237125;0.236059;0.234374 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 557856;556192;557504;556288;557376 | |
189 | add_resnetv13_stage3_activation12 | add_relu | [1,1024,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 45290.67 | 57.40 | 0.24 | 40.14 | true | 0.573068;0.564184;0.577261;0.579899;0.572720 | 200704;200704;200704;200704;200704 | 802816;802816;803072;802816;802816 | 44544;46336;43008;46144;45184 | |
190 | resnetv13_stage3_conv40_fwd | Convolution | [1,1024,14,14] | 5890 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 117490688 | 1048576.00 | 35498.67 | 3.10 | 108.38 | 1753.59 | false | 0.031248;0.031248;0.031247;0.031248;0.031247 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1048576 | 29696;51200;40256;20480;36544 | |
190 | resnetv13_stage3_conv40_fwd | Convolution | [1,1024,14,14] | 5890 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 250880 | 1024.00 | 2560.00 | 36.80 | 70.00 | 53.76 | false | 0.367457;0.368390;0.364324;0.369144;0.369537 | 250880;250880;250880;250880;250880 | 2816;1024;1024;1024;1024 | 6400;1280;10752;0;0 | |
191 | resnetv13_stage3_batchnorm40_fwd | BatchNorm | [1,256,14,14] | 25.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 317440 | 4096.00 | 172245.33 | 8.60 | 1.80 | 68.02 | true | 0.086138;0.088790;0.086356;0.086872;0.085791 | 317440;317440;317440;317440;317440 | 178368;166400;162240;179552;171968 | 4096;4096;4096;4096;4096 | |
192 | resnetv13_stage3_relu26_fwd | Activation | [1,256,14,14] | 20.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 19882.67 | 35.10 | 5.05 | 25.09 | true | 0.351613;0.351506;0.350019;0.351187;0.352426 | 100352;100352;100352;100352;100352 | 0;1280;0;0;0 | 20224;18848;19712;20096;19840 | |
193 | resnetv13_stage3_conv41_fwd | Convolution | [1,256,14,14] | 13465.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.00 | 140148736 | 70272.00 | 1098954.67 | 12.50 | 119.86 | 2189.82 | false | 0.124922;0.124922;0.124919;0.124920;0.124918 | 140148736;140148736;140148736;140148736;140148736 | 71392;57696;77760;61664;80000 | 1062496;1110432;1103456;1110080;1083328 | |
193 | resnetv13_stage3_conv41_fwd | Convolution | [1,256,14,14] | 13465.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 3801088 | 2360021.33 | 3245984.00 | 31.30 | 0.68 | 367.86 | true | 0.313566;0.315954;0.316234;0.309846;0.309778 | 3801088;3801088;3801088;3801088;3801088 | 2366272;2359296;2361408;2359296;2359360 | 3270528;3223328;3233312;3244704;3259936 | |
194 | resnetv13_stage3_batchnorm41_fwd | BatchNorm | [1,256,14,14] | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4202.67 | 192000.00 | 8.80 | 1.62 | 63.49 | true | 0.088637;0.087598;0.087890;0.088400;0.088361 | 317440;317440;317440;317440;317440 | 4256;4160;4096;4256;4192 | 193856;192704;190368;189664;192928 | |
195 | resnetv13_stage3_relu27_fwd | Activation | [1,256,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 224.00 | 21290.67 | 38.00 | 4.66 | 25.09 | true | 0.386591;0.369836;0.369015;0.386426;0.385197 | 100352;100352;100352;100352;100352 | 224;224;224;224;128 | 22208;20608;22144;20864;20864 | |
196 | resnetv13_stage3_conv42_fwd | Convolution | [1,256,14,14] | 5774.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048640.00 | 537482.67 | 7.70 | 74.17 | 3921.37 | false | 0.078235;0.076555;0.076750;0.077136;0.077282 | 117641216;117641216;117641216;117641216;117641216 | 1048640;1048704;1048576;1048704;1048576 | 529632;543136;539136;539072;534240 | |
196 | resnetv13_stage3_conv42_fwd | Convolution | [1,256,14,14] | 5774.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 1003520 | 4096.00 | 65909.33 | 65.70 | 14.33 | 188.17 | true | 0.661349;0.663461;0.655816;0.655248;0.650216 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 72608;60256;64000;64736;68992 | |
197 | resnetv13_stage3_batchnorm42_fwd | BatchNorm | [1,1024,14,14] | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1269760 | 16384.00 | 554090.67 | 23.60 | 2.23 | 181.39 | true | 0.231724;0.237463;0.231861;0.238713;0.238616 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;17664;16384 | 553056;554624;554816;553088;554560 | |
198 | add_resnetv13_stage3_activation13 | add_relu | [1,1024,14,14] | 85 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 48416.00 | 56.80 | 0.24 | 40.14 | true | 0.568357;0.574902;0.567721;0.567670;0.566386 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 49440;47776;47520;49440;48032 | |
199 | resnetv13_stage3_conv43_fwd | Convolution | [1,1024,14,14] | 5892 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 117490688 | 1048576.00 | 34090.67 | 3.10 | 108.52 | 1753.59 | false | 0.031247;0.031247;0.031247;0.031247;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1050368 | 32640;41856;39104;26624;30528 | |
199 | resnetv13_stage3_conv43_fwd | Convolution | [1,1024,14,14] | 5892 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 250880 | 1024.00 | 170.67 | 36.70 | 210.00 | 57.90 | false | 0.365369;0.366016;0.370824;0.366021;0.369414 | 250880;250880;250880;250880;250880 | 6144;1024;1024;1024;1024 | 256;128;128;128;12928 | |
200 | resnetv13_stage3_batchnorm43_fwd | BatchNorm | [1,256,14,14] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4181.33 | 178218.67 | 8.60 | 1.74 | 63.49 | true | 0.085084;0.085921;0.085761;0.087187;0.085302 | 317440;317440;317440;317440;317440 | 4352;4352;4096;4096;4096 | 190144;194176;172000;171136;172512 | |
201 | resnetv13_stage3_relu28_fwd | Activation | [1,256,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 23893.33 | 35.10 | 4.20 | 25.09 | true | 0.350818;0.352572;0.349925;0.350576;0.351783 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 23296;23808;24192;23808;24064 | |
202 | resnetv13_stage3_conv44_fwd | Convolution | [1,256,14,14] | 13452 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.00 | 140148736 | 62730.67 | 1083072.00 | 12.50 | 122.31 | 2189.82 | false | 0.124921;0.124918;0.124923;0.124919;0.124922 | 140148736;140148736;140148736;140148736;140148736 | 1074592;1101824;1072800;1117664;1045856 | 65760;56032;65632;56800;84608 | |
202 | resnetv13_stage3_conv44_fwd | Convolution | [1,256,14,14] | 13452 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 3801088 | 2359338.67 | 3260010.67 | 31.60 | 0.68 | 367.86 | true | 0.313797;0.317766;0.318232;0.315225;0.314813 | 3801088;3801088;3801088;3801088;3801088 | 3262656;3223328;3273888;3243488;3296672 | 2359296;2359296;2359424;2359296;2359552 | |
203 | resnetv13_stage3_batchnorm44_fwd | BatchNorm | [1,256,14,14] | 74 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4256.00 | 183157.33 | 8.90 | 1.69 | 63.49 | true | 0.088435;0.088849;0.088270;0.089130;0.087596 | 317440;317440;317440;317440;317440 | 183040;183808;184576;181760;182624 | 4256;4256;4256;4256;4096 | |
204 | resnetv13_stage3_relu29_fwd | Activation | [1,256,14,14] | 23.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 224.00 | 25536.00 | 37.60 | 3.90 | 25.09 | true | 0.373549;0.370549;0.369675;0.384973;0.400584 | 100352;100352;100352;100352;100352 | 224;224;224;224;224 | 25472;25472;25792;25472;25664 | |
205 | resnetv13_stage3_conv45_fwd | Convolution | [1,256,14,14] | 5739.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048640.00 | 538613.33 | 7.70 | 74.12 | 3921.37 | false | 0.077507;0.076542;0.077895;0.077052;0.076678 | 117641216;117641216;117641216;117641216;117641216 | 1048640;1048704;1048576;1048704;1048576 | 536288;543680;528960;546592;535872 | |
205 | resnetv13_stage3_conv45_fwd | Convolution | [1,256,14,14] | 5739.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 1003520 | 4096.00 | 64533.33 | 66.10 | 14.62 | 200.70 | true | 0.659840;0.659405;0.662407;0.660296;0.666404 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 66816;59968;73856;57152;66816 | |
206 | resnetv13_stage3_batchnorm45_fwd | BatchNorm | [1,1024,14,14] | 55.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1269760 | 16384.00 | 558122.67 | 23.70 | 2.21 | 181.39 | true | 0.240491;0.234700;0.236899;0.239332;0.230398 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 558272;560896;553792;558592;557504 | |
207 | add_resnetv13_stage3_activation14 | add_relu | [1,1024,14,14] | 78 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 44330.67 | 57.30 | 0.24 | 40.14 | true | 0.570739;0.573829;0.574923;0.571026;0.579496 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 44032;41600;48512;43904;45056 | |
208 | resnetv13_stage3_conv46_fwd | Convolution | [1,1024,14,14] | 5906.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.33 | 117490688 | 1048576.00 | 42453.33 | 3.10 | 107.69 | 1771.23 | false | 0.031248;0.031248;0.031247;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1048576 | 39040;38656;36032;49664;52800 | |
208 | resnetv13_stage3_conv46_fwd | Convolution | [1,1024,14,14] | 5906.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 250880 | 1024.00 | 128.00 | 36.10 | 217.78 | 62.72 | false | 0.360153;0.357055;0.353136;0.364835;0.364887 | 250880;250880;250880;250880;250880 | 1024;1024;2304;1024;1024 | 0;0;32000;128;256 | |
209 | resnetv13_stage3_batchnorm46_fwd | BatchNorm | [1,256,14,14] | 23.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 163317.33 | 8.60 | 1.90 | 63.49 | true | 0.086046;0.084514;0.085735;0.086299;0.085264 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 164160;164288;155712;164192;161600 | |
210 | resnetv13_stage3_relu30_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 19637.33 | 35.10 | 5.11 | 25.09 | true | 0.351094;0.351239;0.350383;0.352268;0.351404 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 19328;20000;19328;19584;20224 | |
211 | resnetv13_stage3_conv47_fwd | Convolution | [1,256,14,14] | 13513.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.00 | 140148736 | 72938.67 | 1093994.67 | 12.50 | 120.10 | 2189.82 | false | 0.124917;0.124913;0.124920;0.124918;0.124915 | 140148736;140148736;140148736;140148736;140148736 | 90592;93632;65920;62304;59392 | 1074176;1066592;1117856;1099104;1108704 | |
211 | resnetv13_stage3_conv47_fwd | Convolution | [1,256,14,14] | 13513.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 3801088 | 2359317.33 | 3251104.00 | 31.50 | 0.68 | 380.11 | true | 0.314749;0.314751;0.316978;0.317321;0.314613 | 3801088;3801088;3801088;3801088;3801088 | 2359296;2359296;2359360;2359296;2359360 | 3281248;3284000;3216896;3241920;3230144 | |
212 | resnetv13_stage3_batchnorm47_fwd | BatchNorm | [1,256,14,14] | 83 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4256.00 | 191306.67 | 8.80 | 1.62 | 63.49 | true | 0.087822;0.087384;0.088280;0.089224;0.088940 | 317440;317440;317440;317440;317440 | 188864;192352;190816;192608;190752 | 4256;4256;6656;4256;4096 | |
213 | resnetv13_stage3_relu31_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 160.00 | 22058.67 | 37.60 | 4.52 | 25.09 | true | 0.382757;0.373555;0.372742;0.386258;0.372686 | 100352;100352;100352;100352;100352 | 224;128;128;224;128 | 22400;22528;22144;20864;21632 | |
214 | resnetv13_stage3_conv48_fwd | Convolution | [1,256,14,14] | 5757 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 29.67 | 117641216 | 1048661.33 | 540469.33 | 7.70 | 74.03 | 3965.39 | false | 0.077063;0.076492;0.077206;0.077447;0.077080 | 117641216;117641216;117641216;117641216;117641216 | 1048704;1048640;1048640;1048704;1048576 | 540864;539904;536512;541824;540640 | |
214 | resnetv13_stage3_conv48_fwd | Convolution | [1,256,14,14] | 5757 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 1003520 | 4096.00 | 62677.33 | 66.50 | 15.03 | 200.70 | true | 0.672857;0.650413;0.663602;0.666128;0.665014 | 1003520;1003520;1003520;1003520;1003520 | 62304;63264;66304;62144;62464 | 4096;9216;4096;4096;4096 | |
215 | resnetv13_stage3_batchnorm48_fwd | BatchNorm | [1,1024,14,14] | 57 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.67 | 1269760 | 16384.00 | 553013.33 | 23.50 | 2.23 | 190.45 | true | 0.234808;0.239654;0.242147;0.231564;0.231157 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 554144;551104;552512;554880;552384 | |
216 | add_resnetv13_stage3_activation15 | add_relu | [1,1024,14,14] | 87.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 49397.33 | 57.50 | 0.24 | 40.14 | true | 0.581037;0.560664;0.581778;0.578247;0.567036 | 200704;200704;200704;200704;200704 | 802816;803072;802816;802816;802816 | 48288;50208;49696;47520;50208 | |
217 | resnetv13_stage3_conv49_fwd | Convolution | [1,1024,14,14] | 5895.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 117490688 | 1048576.00 | 29802.67 | 3.10 | 108.95 | 1762.35 | false | 0.031247;0.031247;0.031248;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 40960;32000;26624;24960;30784 | 1048576;1048576;1048576;1053952;1048576 | |
217 | resnetv13_stage3_conv49_fwd | Convolution | [1,1024,14,14] | 5895.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 250880 | 1024.00 | 2858.67 | 35.80 | 64.62 | 53.76 | false | 0.351365;0.357863;0.367567;0.364210;0.352190 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 0;8064;384;10240;128 | |
218 | resnetv13_stage3_batchnorm49_fwd | BatchNorm | [1,256,14,14] | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 178037.33 | 8.70 | 1.74 | 63.49 | true | 0.085765;0.086488;0.087630;0.086989;0.086377 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 178496;178176;178144;177792;176096 | |
219 | resnetv13_stage3_relu32_fwd | Activation | [1,256,14,14] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 24618.67 | 35.10 | 4.08 | 25.09 | true | 0.351340;0.354642;0.350372;0.351173;0.351665 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 24448;25088;24832;24576;24448 | |
220 | resnetv13_stage3_conv50_fwd | Convolution | [1,256,14,14] | 13486 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.33 | 140148736 | 74965.33 | 1036138.67 | 12.50 | 126.13 | 2178.49 | false | 0.124916;0.124922;0.124918;0.124919;0.124916 | 140148736;140148736;140148736;140148736;140148736 | 56928;68768;74880;81248;94304 | 1065024;1015840;1036288;1056288;981280 | |
220 | resnetv13_stage3_conv50_fwd | Convolution | [1,256,14,14] | 13486 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 3801088 | 2360234.67 | 3302336.00 | 31.80 | 0.67 | 380.11 | true | 0.320783;0.318038;0.319234;0.317783;0.317916 | 3801088;3801088;3801088;3801088;3801088 | 2361856;2368256;2359360;2359296;2359488 | 3269312;3316672;3305856;3284480;3364608 | |
221 | resnetv13_stage3_batchnorm50_fwd | BatchNorm | [1,256,14,14] | 75 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4234.67 | 185685.33 | 8.90 | 1.67 | 63.49 | true | 0.089184;0.087627;0.088245;0.088899;0.089044 | 317440;317440;317440;317440;317440 | 4256;4192;4160;4256;4256 | 186112;185856;185088;184192;187072 | |
222 | resnetv13_stage3_relu33_fwd | Activation | [1,256,14,14] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 224.00 | 25536.00 | 37.60 | 3.90 | 25.09 | true | 0.374047;0.381565;0.371638;0.381753;0.372666 | 100352;100352;100352;100352;100352 | 25472;25472;26304;25472;25664 | 224;224;128;224;224 | |
223 | resnetv13_stage3_conv51_fwd | Convolution | [1,256,14,14] | 5751 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 29.67 | 117641216 | 1048682.67 | 538976.00 | 7.70 | 74.10 | 3965.39 | false | 0.077450;0.076882;0.076855;0.077319;0.077590 | 117641216;117641216;117641216;117641216;117641216 | 1050496;1048704;1048640;1048704;1048576 | 545984;537408;540992;538528;536000 | |
223 | resnetv13_stage3_conv51_fwd | Convolution | [1,256,14,14] | 5751 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 1003520 | 4096.00 | 64480.00 | 65.70 | 14.63 | 177.08 | true | 0.657792;0.648108;0.665776;0.645047;0.670551 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 56128;66240;62208;64992;67456 | |
224 | resnetv13_stage3_batchnorm51_fwd | BatchNorm | [1,1024,14,14] | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1269760 | 16384.00 | 556288.00 | 23.60 | 2.22 | 181.39 | true | 0.233126;0.234882;0.239238;0.238762;0.234777 | 1269760;1269760;1269760;1269760;1269760 | 554848;556544;555200;557184;557120 | 16384;16384;16384;16384;20736 | |
225 | add_resnetv13_stage3_activation16 | add_relu | [1,1024,14,14] | 84.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 46080.00 | 58.00 | 0.24 | 40.14 | true | 0.578920;0.580062;0.581512;0.562529;0.587079 | 200704;200704;200704;200704;200704 | 802816;804608;802816;802816;802816 | 47616;45952;47104;45184;42752 | |
226 | resnetv13_stage3_conv52_fwd | Convolution | [1,1024,14,14] | 5878.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 117490688 | 1048576.00 | 35712.00 | 3.10 | 108.36 | 1753.59 | false | 0.031248;0.031248;0.031248;0.031247;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1048576 | 38784;48640;33408;34944;27968 | |
226 | resnetv13_stage3_conv52_fwd | Convolution | [1,1024,14,14] | 5878.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 250880 | 1024.00 | 384.00 | 36.20 | 178.18 | 57.90 | false | 0.361988;0.363319;0.360805;0.359454;0.368797 | 250880;250880;250880;250880;250880 | 128;0;128;9984;896 | 1024;1024;1024;1024;1024 | |
227 | resnetv13_stage3_batchnorm52_fwd | BatchNorm | [1,256,14,14] | 25.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 168384.00 | 8.60 | 1.84 | 63.49 | true | 0.085177;0.086663;0.085951;0.086554;0.086138 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 167616;164160;173376;163680;173376 | |
228 | resnetv13_stage3_relu34_fwd | Activation | [1,256,14,14] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 20362.67 | 35.20 | 4.93 | 25.09 | true | 0.351222;0.353045;0.350107;0.351215;0.352467 | 100352;100352;100352;100352;100352 | 0;0;256;0;0 | 20352;20256;20480;19200;20736 | |
229 | resnetv13_stage3_conv53_fwd | Convolution | [1,256,14,14] | 13435.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.67 | 140148736 | 63360.00 | 1084832.00 | 12.50 | 122.06 | 2167.24 | false | 0.124917;0.124920;0.124921;0.124918;0.124915 | 140148736;140148736;140148736;140148736;140148736 | 1008704;1108288;1078176;1104224;1072096 | 116064;52608;71680;52448;65792 | |
229 | resnetv13_stage3_conv53_fwd | Convolution | [1,256,14,14] | 13435.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 3801088 | 2359338.67 | 3261290.67 | 31.60 | 0.68 | 345.55 | true | 0.315621;0.315988;0.316943;0.315473;0.315108 | 3801088;3801088;3801088;3801088;3801088 | 2359296;2359296;2359296;2359552;2359424 | 3333408;3224032;3268800;3243648;3271424 | |
230 | resnetv13_stage3_batchnorm53_fwd | BatchNorm | [1,256,14,14] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4234.67 | 192544.00 | 8.90 | 1.61 | 63.49 | true | 0.089211;0.087824;0.089162;0.089239;0.086683 | 317440;317440;317440;317440;317440 | 4192;4256;9216;4160;4256 | 193984;192128;193696;190944;191808 | |
231 | resnetv13_stage3_relu35_fwd | Activation | [1,256,14,14] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 160.00 | 21973.33 | 37.80 | 4.53 | 25.09 | true | 0.386743;0.369501;0.374533;0.386292;0.371907 | 100352;100352;100352;100352;100352 | 22528;22528;21248;20736;22144 | 17024;128;224;128;128 | |
232 | resnetv13_stage3_conv54_fwd | Convolution | [1,256,14,14] | 5774 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 29.33 | 117641216 | 1048661.33 | 541045.33 | 7.70 | 74.00 | 4010.54 | false | 0.077057;0.076668;0.076736;0.076673;0.077477 | 117641216;117641216;117641216;117641216;117641216 | 1048640;1048640;1050912;1048704;1048576 | 544288;537600;541408;541216;540512 | |
232 | resnetv13_stage3_conv54_fwd | Convolution | [1,256,14,14] | 5774 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1003520 | 4096.00 | 62538.67 | 66.80 | 15.06 | 167.25 | true | 0.667786;0.668160;0.667607;0.667911;0.666432 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4608 | 58624;65888;62336;62816;62464 | |
233 | resnetv13_stage3_batchnorm54_fwd | BatchNorm | [1,1024,14,14] | 63 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1269760 | 16384.00 | 551370.67 | 23.90 | 2.24 | 181.39 | true | 0.235844;0.232739;0.245279;0.248570;0.236502 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 550752;552064;554560;551296;550048 | |
234 | add_resnetv13_stage3_activation17 | add_relu | [1,1024,14,14] | 101.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 50634.67 | 56.90 | 0.24 | 40.14 | true | 0.571278;0.572972;0.568726;0.568393;0.558830 | 200704;200704;200704;200704;200704 | 51616;50592;47776;49696;52512 | 802816;802816;802816;802816;802816 | |
235 | resnetv13_stage3_conv55_fwd | Convolution | [1,1024,14,14] | 5913.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 117490688 | 1048576.00 | 29269.33 | 3.10 | 109.01 | 1762.35 | false | 0.031248;0.031249;0.031248;0.031248;0.031247 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1055488;1048576;1048576;1048576 | 26880;27968;23360;38144;32960 | |
235 | resnetv13_stage3_conv55_fwd | Convolution | [1,1024,14,14] | 5913.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 250880 | 1024.00 | 170.67 | 36.40 | 210.00 | 57.90 | false | 0.366387;0.365029;0.361357;0.360456;0.364377 | 250880;250880;250880;250880;250880 | 1024;3840;1024;1024;1024 | 128;7296;128;0;256 | |
236 | resnetv13_stage3_batchnorm55_fwd | BatchNorm | [1,256,14,14] | 23.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 175541.33 | 8.60 | 1.77 | 63.49 | true | 0.086584;0.085149;0.086168;0.086694;0.086244 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 172608;181504;172512;185664;169152 | |
237 | resnetv13_stage3_relu36_fwd | Activation | [1,256,14,14] | 22.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 23637.33 | 35.10 | 4.25 | 25.09 | true | 0.350678;0.350932;0.351708;0.349812;0.351380 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 23680;22528;24448;24704;22784 | |
238 | resnetv13_stage3_conv56_fwd | Convolution | [1,256,14,14] | 14759.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.00 | 140148736 | 70101.33 | 1050912.00 | 12.50 | 125.02 | 2189.82 | false | 0.124919;0.124922;0.124921;0.124916;0.124915 | 140148736;140148736;140148736;140148736;140148736 | 67104;59232;78336;64864;82432 | 1049728;1054464;1068768;1048544;978464 | |
238 | resnetv13_stage3_conv56_fwd | Convolution | [1,256,14,14] | 14759.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 3801088 | 2359360.00 | 3299349.33 | 31.50 | 0.67 | 380.11 | true | 0.311030;0.314292;0.315004;0.315684;0.318055 | 3801088;3801088;3801088;3801088;3801088 | 2359360;2359360;2359424;2359296;2359360 | 3310304;3291136;3296608;3285120;3377664 | |
239 | resnetv13_stage3_batchnorm56_fwd | BatchNorm | [1,256,14,14] | 88.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4256.00 | 184160.00 | 8.80 | 1.68 | 63.49 | true | 0.089214;0.088231;0.085635;0.088256;0.088197 | 317440;317440;317440;317440;317440 | 4256;10912;4096;4256;4256 | 183872;184448;183680;184160;189344 | |
240 | resnetv13_stage3_relu37_fwd | Activation | [1,256,14,14] | 25.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 224.00 | 25600.00 | 37.50 | 3.89 | 25.09 | true | 0.378086;0.373351;0.372955;0.379633;0.371818 | 100352;100352;100352;100352;100352 | 25536;25472;25792;25344;25856 | 224;224;224;224;128 | |
241 | resnetv13_stage3_conv57_fwd | Convolution | [1,256,14,14] | 6093 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048629.33 | 539509.33 | 7.70 | 74.07 | 3921.37 | false | 0.076637;0.076834;0.076861;0.077191;0.077204 | 117641216;117641216;117641216;117641216;117641216 | 1048672;1048640;1048576;1048704;1048576 | 539136;549472;539968;539424;536544 | |
241 | resnetv13_stage3_conv57_fwd | Convolution | [1,256,14,14] | 6093 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 1003520 | 4096.00 | 63946.67 | 66.50 | 14.75 | 188.17 | true | 0.665991;0.669621;0.658106;0.657457;0.671344 | 1003520;1003520;1003520;1003520;1003520 | 64288;54112;63360;64192;66816 | 4096;4096;4096;4096;9216 | |
242 | resnetv13_stage3_batchnorm57_fwd | BatchNorm | [1,1024,14,14] | 69.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 1269760 | 16384.00 | 556629.33 | 23.40 | 2.22 | 200.50 | true | 0.234455;0.233542;0.234891;0.239510;0.231808 | 1269760;1269760;1269760;1269760;1269760 | 556096;554880;557760;556032;557888 | 16384;16384;16384;16384;16384 | |
243 | add_resnetv13_stage3_activation18 | add_relu | [1,1024,14,14] | 98 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 46634.67 | 57.00 | 0.24 | 40.14 | true | 0.570196;0.568831;0.564029;0.571308;0.577491 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 46080;47616;44800;46336;47488 | |
244 | resnetv13_stage3_conv58_fwd | Convolution | [1,1024,14,14] | 5950.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.00 | 117490688 | 1048576.00 | 42666.67 | 3.10 | 107.67 | 1780.16 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1060096 | 36096;28736;39872;54720;52032 | |
244 | resnetv13_stage3_conv58_fwd | Convolution | [1,1024,14,14] | 5950.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 250880 | 1024.00 | 1109.33 | 36.80 | 117.60 | 62.72 | false | 0.367841;0.368273;0.357136;0.367826;0.378016 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 1024;1152;1152;128;31744 | |
245 | resnetv13_stage3_batchnorm58_fwd | BatchNorm | [1,256,14,14] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 173258.67 | 8.70 | 1.79 | 63.49 | true | 0.086319;0.087813;0.086465;0.087543;0.087088 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 174912;176576;178112;168288;144320 | |
246 | resnetv13_stage3_relu38_fwd | Activation | [1,256,14,14] | 23.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 19893.33 | 35.20 | 5.04 | 25.09 | true | 0.352170;0.351287;0.351284;0.352539;0.351754 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 20352;19872;20352;18944;19456 | |
247 | resnetv13_stage3_conv59_fwd | Convolution | [1,256,14,14] | 13523.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.67 | 140148736 | 69045.33 | 1064618.67 | 12.50 | 123.62 | 2167.24 | false | 0.124920;0.124921;0.124922;0.124919;0.124918 | 140148736;140148736;140148736;140148736;140148736 | 67904;60256;79360;72160;67072 | 1047328;1091968;1054656;1066624;1072576 | |
247 | resnetv13_stage3_conv59_fwd | Convolution | [1,256,14,14] | 13523.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 3801088 | 2359381.33 | 3260565.33 | 31.70 | 0.68 | 345.55 | true | 0.316503;0.317601;0.317687;0.317244;0.315255 | 3801088;3801088;3801088;3801088;3801088 | 2359296;2359488;2359360;2359424;2359360 | 3289952;3258336;3263168;3260192;3252000 | |
248 | resnetv13_stage3_batchnorm59_fwd | BatchNorm | [1,256,14,14] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4224.00 | 191562.67 | 8.80 | 1.62 | 63.49 | true | 0.088842;0.087894;0.088669;0.088026;0.087596 | 317440;317440;317440;317440;317440 | 191456;190048;191776;191456;193440 | 4256;4160;4096;6208;4256 | |
249 | resnetv13_stage3_relu39_fwd | Activation | [1,256,14,14] | 29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 100352 | 192.00 | 22229.33 | 37.60 | 4.48 | 23.16 | true | 0.376524;0.373573;0.373099;0.377368;0.378194 | 100352;100352;100352;100352;100352 | 22144;22976;22400;21120;22144 | 224;128;224;224;128 | |
250 | resnetv13_stage3_conv60_fwd | Convolution | [1,256,14,14] | 6776.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048597.33 | 539018.67 | 7.70 | 74.10 | 3921.37 | false | 0.076921;0.077971;0.076924;0.076470;0.077844 | 117641216;117641216;117641216;117641216;117641216 | 1048704;1048576;1048576;1048640;1048576 | 538816;541568;542464;534208;536672 | |
250 | resnetv13_stage3_conv60_fwd | Convolution | [1,256,14,14] | 6776.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 1003520 | 4096.00 | 63338.67 | 66.60 | 14.88 | 177.08 | true | 0.673487;0.663441;0.671142;0.661425;0.664218 | 1003520;1003520;1003520;1003520;1003520 | 8448;4096;4096;4096;4096 | 62400;61312;58496;69696;66304 | |
251 | resnetv13_stage3_batchnorm60_fwd | BatchNorm | [1,1024,14,14] | 69 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 1269760 | 16384.00 | 552501.33 | 23.40 | 2.23 | 200.50 | true | 0.236062;0.230992;0.243196;0.235168;0.231064 | 1269760;1269760;1269760;1269760;1269760 | 552224;555584;552896;551936;552384 | 16384;16384;16384;16384;17920 | |
252 | add_resnetv13_stage3_activation19 | add_relu | [1,1024,14,14] | 98 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 49824.00 | 55.90 | 0.24 | 40.14 | true | 0.566428;0.547594;0.558074;0.566611;0.551715 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;802816 | 50080;46752;49184;50592;50208 | |
253 | resnetv13_stage3_conv61_fwd | Convolution | [1,1024,14,14] | 6889.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.00 | 117490688 | 1048576.00 | 45376.00 | 3.10 | 107.40 | 1780.16 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 49152;35904;45760;42048;48320 | 1048576;1048576;1048576;1048576;1048576 | |
253 | resnetv13_stage3_conv61_fwd | Convolution | [1,1024,14,14] | 6889.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 250880 | 1024.00 | 597.33 | 36.90 | 154.74 | 57.90 | false | 0.362405;0.370617;0.368508;0.368382;0.370868 | 250880;250880;250880;250880;250880 | 1024;1024;1024;1024;1024 | 0;1280;384;128;1280 | |
254 | resnetv13_stage3_batchnorm61_fwd | BatchNorm | [1,256,14,14] | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 174474.67 | 8.80 | 1.78 | 63.49 | true | 0.087594;0.087506;0.087794;0.087439;0.087219 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 174400;176256;173792;174528;174496 | |
255 | resnetv13_stage3_relu40_fwd | Activation | [1,256,14,14] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 24458.67 | 35.10 | 4.10 | 25.09 | true | 0.350993;0.350716;0.350916;0.351182;0.350754 | 100352;100352;100352;100352;100352 | 0;0;0;0;0 | 23552;25344;24448;24096;24832 | |
256 | resnetv13_stage3_conv62_fwd | Convolution | [1,256,14,14] | 15874.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.00 | 140148736 | 65920.00 | 1072618.67 | 12.50 | 123.10 | 2189.82 | false | 0.124915;0.124918;0.124919;0.124918;0.124923 | 140148736;140148736;140148736;140148736;140148736 | 66272;65696;65792;68704;60288 | 1070464;1076224;1071168;1095744;1056832 | |
256 | resnetv13_stage3_conv62_fwd | Convolution | [1,256,14,14] | 15874.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 3801088 | 2359360.00 | 3262250.67 | 31.80 | 0.68 | 380.11 | true | 0.319696;0.317611;0.318734;0.316319;0.315941 | 3801088;3801088;3801088;3801088;3801088 | 2359360;2359296;2359360;2359616;2359360 | 3265280;3258112;3263360;3235840;3270944 | |
257 | resnetv13_stage3_batchnorm62_fwd | BatchNorm | [1,256,14,14] | 84.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4170.67 | 184074.67 | 8.80 | 1.69 | 63.49 | true | 0.089689;0.087750;0.087771;0.089076;0.087531 | 317440;317440;317440;317440;317440 | 4160;4256;4096;4256;4096 | 183872;183680;185536;184672;182912 | |
258 | resnetv13_stage3_relu41_fwd | Activation | [1,256,14,14] | 24.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 192.00 | 25792.00 | 37.80 | 3.86 | 25.09 | true | 0.385802;0.369299;0.372751;0.429057;0.376090 | 100352;100352;100352;100352;100352 | 25664;25728;26496;25344;25984 | 224;224;128;224;128 | |
259 | resnetv13_stage3_conv63_fwd | Convolution | [1,256,14,14] | 6833.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048618.67 | 543445.33 | 7.70 | 73.89 | 3921.37 | false | 0.076205;0.076670;0.076447;0.076964;0.077367 | 117641216;117641216;117641216;117641216;117641216 | 1053760;1048640;1048576;1048640;1048576 | 543584;545472;546432;535456;541280 | |
259 | resnetv13_stage3_conv63_fwd | Convolution | [1,256,14,14] | 6833.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 1003520 | 4096.00 | 59477.33 | 66.10 | 15.79 | 177.08 | true | 0.671450;0.661409;0.643821;0.669051;0.651091 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 59648;55520;56576;68320;62208 | |
260 | resnetv13_stage3_batchnorm63_fwd | BatchNorm | [1,1024,14,14] | 67.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1269760 | 16384.00 | 558677.33 | 23.50 | 2.21 | 181.39 | true | 0.242230;0.233936;0.237473;0.226830;0.234073 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 559808;556416;558400;558336;559296 | |
261 | add_resnetv13_stage3_activation20 | add_relu | [1,1024,14,14] | 104.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 44501.33 | 56.90 | 0.24 | 40.14 | true | 0.563168;0.570123;0.573322;0.566286;0.572030 | 200704;200704;200704;200704;200704 | 802816;802816;802816;802816;806656 | 42752;46080;45568;44160;43776 | |
262 | resnetv13_stage3_conv64_fwd | Convolution | [1,1024,14,14] | 6908 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 117490688 | 1048576.00 | 47808.00 | 3.10 | 107.16 | 1753.59 | false | 0.031248;0.031248;0.031248;0.031247;0.031247 | 117490688;117490688;117490688;117490688;117490688 | 1048576;1048576;1048576;1048576;1048576 | 34304;46272;61504;55488;41664 | |
262 | resnetv13_stage3_conv64_fwd | Convolution | [1,1024,14,14] | 6908 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 250880 | 1024.00 | 1365.33 | 36.70 | 105.00 | 53.76 | false | 0.362672;0.367450;0.370289;0.369795;0.364541 | 250880;250880;250880;250880;250880 | 1280;1024;1024;1024;1024 | 8960;1280;256;1792;1024 | |
263 | resnetv13_stage3_batchnorm64_fwd | BatchNorm | [1,256,14,14] | 25.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4096.00 | 157568.00 | 8.70 | 1.96 | 63.49 | true | 0.086494;0.086600;0.086132;0.086540;0.086890 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;14080 | 157632;156512;153280;158560;160960 | |
264 | resnetv13_stage3_relu42_fwd | Activation | [1,256,14,14] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 170.67 | 19498.67 | 35.20 | 5.10 | 25.09 | true | 0.352301;0.351801;0.349874;0.351643;0.354657 | 100352;100352;100352;100352;100352 | 0;0;0;512;2048 | 19840;19456;19200;18304;27008 | |
265 | resnetv13_stage3_conv65_fwd | Convolution | [1,256,14,14] | 15910.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.33 | 140148736 | 62805.33 | 1093504.00 | 12.50 | 121.20 | 2178.49 | false | 0.124920;0.124916;0.124919;0.124914;0.124922 | 140148736;140148736;140148736;140148736;140148736 | 49632;56672;88192;71008;60736 | 1121856;1093088;1033856;1079456;1107968 | |
265 | resnetv13_stage3_conv65_fwd | Convolution | [1,256,14,14] | 15910.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 3801088 | 2359402.67 | 3247818.67 | 31.70 | 0.68 | 345.55 | true | 0.316082;0.317969;0.306893;0.315978;0.317663 | 3801088;3801088;3801088;3801088;3801088 | 3226240;3257568;3299040;3254240;3231648 | 2359296;2359552;2360192;2359360;2359296 | |
266 | resnetv13_stage3_batchnorm65_fwd | BatchNorm | [1,256,14,14] | 85 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4234.67 | 193045.33 | 8.90 | 1.61 | 63.49 | true | 0.089788;0.087539;0.088927;0.089123;0.089133 | 317440;317440;317440;317440;317440 | 4256;4256;4096;4256;4192 | 190016;192448;193920;193504;193184 | |
267 | resnetv13_stage3_relu43_fwd | Activation | [1,256,14,14] | 24.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 224.00 | 21888.00 | 37.60 | 4.54 | 25.09 | true | 0.381301;0.372595;0.374933;0.374059;0.379045 | 100352;100352;100352;100352;100352 | 224;224;224;224;128 | 22144;22016;21888;19968;21760 | |
268 | resnetv13_stage3_conv66_fwd | Convolution | [1,256,14,14] | 6847.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 29.67 | 117641216 | 1049728.00 | 541621.33 | 7.70 | 73.93 | 3965.39 | false | 0.075803;0.076672;0.076878;0.076344;0.077427 | 117641216;117641216;117641216;117641216;117641216 | 540544;541664;542656;531776;544992 | 1053824;1048640;1048576;1051456;1049088 | |
268 | resnetv13_stage3_conv66_fwd | Convolution | [1,256,14,14] | 6847.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 1003520 | 4096.00 | 61578.67 | 66.90 | 15.28 | 188.17 | true | 0.670830;0.669174;0.666487;0.662093;0.670835 | 1003520;1003520;1003520;1003520;1003520 | 4096;4096;4096;4096;4096 | 62688;61504;60544;72032;57984 | |
269 | resnetv13_stage3_batchnorm66_fwd | BatchNorm | [1,1024,14,14] | 69 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 1269760 | 16384.00 | 554069.33 | 23.30 | 2.23 | 211.63 | true | 0.234571;0.231391;0.241296;0.226745;0.231852 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16512;16384 | 555520;550912;553024;555264;553920 | |
270 | add_resnetv13_stage3_activation21 | add_relu | [1,1024,14,14] | 103.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 48330.67 | 56.50 | 0.24 | 40.14 | true | 0.561637;0.573659;0.560057;0.572614;0.561102 | 200704;200704;200704;200704;200704 | 43680;51616;49184;47136;48672 | 812544;802816;802816;802816;802816 | |
271 | resnetv13_stage3_conv67_fwd | Convolution | [1,1024,14,14] | 6912.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 117490688 | 1048576.00 | 33706.67 | 3.10 | 108.56 | 1762.35 | false | 0.031248;0.031248;0.031247;0.031248;0.031248 | 117490688;117490688;117490688;117490688;117490688 | 37760;44608;27968;26560;35392 | 1048576;1048576;1048576;1048576;1048576 | |
271 | resnetv13_stage3_conv67_fwd | Convolution | [1,1024,14,14] | 6912.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 250880 | 1024.00 | 1152.00 | 36.70 | 115.29 | 57.90 | false | 0.366549;0.368496;0.365995;0.371758;0.353840 | 250880;250880;250880;250880;250880 | 1024;256;9984;1152;1280 | 1024;1024;1024;1024;1024 | |
272 | resnetv13_stage3_batchnorm67_fwd | BatchNorm | [1,256,14,14] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 317440 | 4096.00 | 174410.67 | 8.70 | 1.78 | 68.02 | true | 0.087461;0.087083;0.087327;0.087391;0.087402 | 317440;317440;317440;317440;317440 | 4096;4096;4096;4096;4096 | 173888;175232;174304;174144;174784 | |
273 | resnetv13_stage3_relu44_fwd | Activation | [1,256,14,14] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 0.00 | 24448.00 | 35.20 | 4.10 | 25.09 | true | 0.351457;0.351935;0.350941;0.352416;0.351232 | 100352;100352;100352;100352;100352 | 0;0;0;5120;0 | 24704;24704;24832;23936;23840 | |
274 | resnetv13_stage3_conv68_fwd | Convolution | [1,256,14,14] | 15809 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 64.33 | 140148736 | 64842.67 | 1078986.67 | 12.50 | 122.53 | 2178.49 | false | 0.124916;0.124919;0.124918;0.124916;0.124920 | 140148736;140148736;140148736;140148736;140148736 | 59744;59616;66688;75808;68096 | 1085664;1118848;1090976;1042688;1060320 | |
274 | resnetv13_stage3_conv68_fwd | Convolution | [1,256,14,14] | 15809 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.00 | 3801088 | 2359360.00 | 3270688.00 | 31.70 | 0.68 | 380.11 | true | 0.317550;0.319166;0.315884;0.317605;0.315447 | 3801088;3801088;3801088;3801088;3801088 | 2359488;2359360;2359360;2359360;2359360 | 3263744;3217568;3258272;3313184;3290048 | |
275 | resnetv13_stage3_batchnorm68_fwd | BatchNorm | [1,256,14,14] | 82 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 317440 | 4234.67 | 183296.00 | 8.90 | 1.69 | 63.49 | true | 0.088951;0.089061;0.088172;0.089009;0.089006 | 317440;317440;317440;317440;317440 | 4192;4256;4256;10144;4096 | 183456;183296;183744;182816;183136 | |
276 | resnetv13_stage3_relu45_fwd | Activation | [1,256,14,14] | 24.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 100352 | 192.00 | 25728.00 | 38.00 | 3.87 | 25.09 | true | 0.401857;0.377274;0.377815;0.385679;0.371801 | 100352;100352;100352;100352;100352 | 224;128;224;224;128 | 26560;25472;25472;25792;25920 | |
277 | resnetv13_stage3_conv69_fwd | Convolution | [1,256,14,14] | 6808.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 30.00 | 117641216 | 1048629.33 | 540853.33 | 7.80 | 74.01 | 3921.37 | false | 0.077571;0.078124;0.076530;0.076952;0.078257 | 117641216;117641216;117641216;117641216;117641216 | 536832;540064;537760;544736;552480 | 1048832;1048640;1048576;1048608;1048640 | |
277 | resnetv13_stage3_conv69_fwd | Convolution | [1,256,14,14] | 6808.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 1003520 | 4096.00 | 62368.00 | 66.20 | 15.10 | 188.17 | true | 0.671095;0.657864;0.660576;0.664501;0.661156 | 1003520;1003520;1003520;1003520;1003520 | 4384;4096;4096;4096;4096 | 68384;63072;65152;58880;51200 | |
278 | resnetv13_stage3_batchnorm69_fwd | BatchNorm | [1,1024,14,14] | 67.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1269760 | 16384.00 | 557098.67 | 23.60 | 2.21 | 181.39 | true | 0.227216;0.239942;0.239422;0.230466;0.236999 | 1269760;1269760;1269760;1269760;1269760 | 16384;16384;16384;16384;16384 | 557632;556928;556736;555840;558016 | |
279 | add_resnetv13_stage3_activation22 | add_relu | [1,1024,14,14] | 97.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 5.00 | 200704 | 802816.00 | 44885.33 | 57.20 | 0.24 | 40.14 | true | 0.576611;0.564856;0.570253;0.570996;0.573946 | 200704;200704;200704;200704;200704 | 44544;39808;45568;46592;44544 | 802816;803072;802816;802816;802816 | |
280 | resnetv13_stage4_conv0_fwd | Convolution | [1,1024,14,14] | 4336 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 68.33 | 67133952 | 2097152.00 | 904170.67 | 3.10 | 22.37 | 982.45 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 67133952;67133952;67133952;67133952;67133952 | 2097152;2097152;2097152;2097152;2097152 | 902816;905792;903552;905984;903168 | |
280 | resnetv13_stage4_conv0_fwd | Convolution | [1,1024,14,14] | 4336 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 125440 | 2048.00 | 2048.00 | 26.10 | 30.62 | 31.36 | false | 0.260043;0.261950;0.257032;0.262621;0.261974 | 125440;125440;125440;125440;125440 | 2048;2048;2048;2048;2048 | 2048;2048;2048;2048;2048 | |
281 | resnetv13_stage4_batchnorm0_fwd | BatchNorm | [1,512,7,7] | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 175616 | 8192.00 | 6058.67 | 14.20 | 12.32 | 40.53 | true | 0.147185;0.142178;0.140103;0.143849;0.140107 | 175616;175616;175616;175616;175616 | 8192;8192;8192;8192;13312 | 6272;5888;6144;5504;6144 | |
282 | resnetv13_stage4_relu0_fwd | Activation | [1,512,7,7] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 50176 | 0.00 | 0.00 | 28.20 | 0.00 | 12.54 | true | 0.282350;0.282020;0.280061;0.281273;0.281699 | 50176;50176;50176;50176;50176 | 0;0;0;0;0 | 0;0;0;0;0 | |
283 | resnetv13_stage4_conv1_fwd | Convolution | [1,512,7,7] | 19387 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 122.67 | 279609344 | 16877856.00 | 3497141.33 | 12.50 | 13.72 | 2279.42 | true | 0.124973;0.124973;0.124973;0.124973;0.124973 | 279609344;279609344;279609344;279609344;279609344 | 16877856;16877856;16877856;16877856;16877856 | 3506656;3477728;3500128;3523040;3484640 | |
283 | resnetv13_stage4_conv1_fwd | Convolution | [1,512,7,7] | 19387 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 35.67 | 15204352 | 9589098.67 | 15319477.33 | 58.50 | 0.61 | 426.29 | true | 0.584955;0.586352;0.583410;0.582140;0.585786 | 15204352;15204352;15204352;15204352;15204352 | 9583488;9580480;9603328;9569728;9612224 | 15313856;15339808;15319584;15293952;15324992 | |
284 | resnetv13_stage4_batchnorm1_fwd | BatchNorm | [1,512,7,7] | 84.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 175616 | 8352.00 | 0.00 | 15.20 | 21.03 | 35.12 | false | 0.150841;0.147870;0.152620;0.152920;0.151885 | 175616;175616;175616;175616;175616 | 0;0;0;0;0 | 8352;8352;13472;8352;8352 | |
285 | resnetv13_stage4_relu1_fwd | Activation | [1,512,7,7] | 16.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 50176 | 288.00 | 0.00 | 32.70 | 174.22 | 12.54 | false | 0.324770;0.327057;0.327853;0.324838;0.327852 | 50176;50176;50176;50176;50176 | 288;288;288;288;288 | 0;0;0;0;0 | |
286 | resnetv13_stage4_conv2_fwd | Convolution | [1,512,7,7] | 8302.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 52.00 | 134318080 | 4194304.00 | 0.00 | 5.00 | 32.02 | 2583.04 | false | 0.049183;0.049962;0.050034;0.049216;0.049707 | 134318080;134318080;134318080;134318080;134318080 | 4194304;4194304;4194304;4194304;4194304 | 0;0;0;0;0 | |
286 | resnetv13_stage4_conv2_fwd | Convolution | [1,512,7,7] | 8302.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 501760 | 8192.00 | 0.00 | 59.00 | 61.25 | 107.51 | false | 0.589309;0.588225;0.590317;0.601566;0.588926 | 501760;501760;501760;501760;501760 | 8192;8192;16640;8192;8192 | 0;0;0;0;0 | |
287 | resnetv13_stage4_batchnorm2_fwd | BatchNorm | [1,2048,7,7] | 53.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 702464 | 32768.00 | 256.00 | 17.50 | 21.27 | 100.35 | false | 0.169505;0.178679;0.173079;0.171867;0.182287 | 702464;702464;702464;702464;702464 | 32768;32768;32768;32768;32768 | 0;1024;0;768;0 | |
288 | resnetv13_stage4_conv3_fwd | Convolution | [1,1024,14,14] | 16308.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 95.67 | 268535808 | 9191552.00 | 1110912.00 | 5.00 | 26.07 | 2806.98 | false | 0.049888;0.049834;0.049904;0.049857;0.049907 | 268535808;268535808;268535808;268535808;268535808 | 9191552;9191552;9191552;9191552;9191552 | 1112576;1111136;1111136;1110464;1109888 | |
289 | resnetv13_stage4_batchnorm3_fwd | BatchNorm | [1,2048,7,7] | 118.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 702464 | 32928.00 | 0.00 | 20.50 | 21.33 | 95.79 | false | 0.195495;0.215851;0.214348;0.199452;0.199936 | 702464;702464;702464;702464;702464 | 32928;32928;32928;32928;32928 | 0;0;0;0;0 | |
290 | add_resnetv13_stage4_activation0 | add_relu | [1,2048,7,7] | 54.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 4.00 | 100352 | 54122.67 | 0.00 | 44.50 | 1.85 | 25.09 | true | 0.443196;0.442149;0.455922;0.444636;0.447013 | 100352;100352;100352;100352;100352 | 56256;49344;51648;54464;57408 | 0;0;0;0;0 | |
291 | resnetv13_stage4_conv4_fwd | Convolution | [1,2048,7,7] | 8151.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 129.67 | 134242816 | 4194304.00 | 24501.33 | 3.10 | 31.82 | 1035.29 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 134242816;134242816;134242816;134242816;134242816 | 4194304;4194304;4194304;4194304;4194304 | 25600;24192;24192;25120;24192 | |
291 | resnetv13_stage4_conv4_fwd | Convolution | [1,2048,7,7] | 8151.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 125440 | 2272.00 | 597.33 | 27.70 | 43.72 | 28.95 | false | 0.279802;0.268518;0.283081;0.275069;0.275993 | 125440;125440;125440;125440;125440 | 2272;2272;2784;2272;2272 | 512;1024;768;512;512 | |
292 | resnetv13_stage4_batchnorm4_fwd | BatchNorm | [1,512,7,7] | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 175616 | 8192.00 | 102314.67 | 14.30 | 1.59 | 35.12 | true | 0.142923;0.141678;0.142564;0.142934;0.142963 | 175616;175616;175616;175616;175616 | 8192;8192;8192;8192;8192 | 102144;102272;102400;102400;102272 | |
293 | resnetv13_stage4_relu2_fwd | Activation | [1,512,7,7] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 50176 | 0.00 | 2218.67 | 28.10 | 22.62 | 12.54 | false | 0.282300;0.281999;0.280940;0.279595;0.281116 | 50176;50176;50176;50176;50176 | 0;0;0;0;0 | 2304;2176;2304;2048;2176 | |
294 | resnetv13_stage4_conv5_fwd | Convolution | [1,512,7,7] | 19367.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 122.67 | 279609344 | 16877856.00 | 3476021.33 | 12.50 | 13.74 | 2279.42 | true | 0.124973;0.124974;0.124973;0.124973;0.124973 | 279609344;279609344;279609344;279609344;279609344 | 16877856;16877856;16877856;16877856;16877856 | 3513952;3483232;3465952;3478880;3455072 | |
294 | resnetv13_stage4_conv5_fwd | Convolution | [1,512,7,7] | 19367.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 36.00 | 15204352 | 9566954.67 | 14580725.33 | 58.00 | 0.63 | 422.34 | true | 0.580520;0.578737;0.579673;0.580864;0.580348 | 15204352;15204352;15204352;15204352;15204352 | 9564032;9566080;9570752;9596800;9549888 | 14541984;14573856;14590752;14577568;14602688 | |
295 | resnetv13_stage4_batchnorm5_fwd | BatchNorm | [1,512,7,7] | 84.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 175616 | 8352.00 | 0.00 | 15.30 | 21.03 | 35.12 | false | 0.151582;0.152548;0.154134;0.156684;0.152021 | 175616;175616;175616;175616;175616 | 8352;8352;10400;8352;8352 | 0;0;0;0;0 | |
296 | resnetv13_stage4_relu3_fwd | Activation | [1,512,7,7] | 16.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 50176 | 288.00 | 0.00 | 33.20 | 174.22 | 11.58 | false | 0.341360;0.327198;0.327778;0.326962;0.351147 | 50176;50176;50176;50176;50176 | 288;288;288;288;288 | 0;0;0;0;0 | |
297 | resnetv13_stage4_conv6_fwd | Convolution | [1,512,7,7] | 8301.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 51.33 | 134318080 | 4194304.00 | 0.00 | 4.90 | 32.02 | 2616.60 | false | 0.048862;0.049535;0.049787;0.049265;0.049108 | 134318080;134318080;134318080;134318080;134318080 | 4194304;4194304;4194304;4194304;4194304 | 0;0;0;0;0 | |
297 | resnetv13_stage4_conv6_fwd | Convolution | [1,512,7,7] | 8301.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 501760 | 8192.00 | 0.00 | 58.60 | 61.25 | 100.35 | false | 0.585341;0.579313;0.586242;0.590953;0.587619 | 501760;501760;501760;501760;501760 | 8192;8192;8192;8192;8192 | 0;0;0;0;0 | |
298 | resnetv13_stage4_batchnorm6_fwd | BatchNorm | [1,2048,7,7] | 54.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 702464 | 32768.00 | 213.33 | 18.00 | 21.30 | 95.79 | false | 0.176010;0.173822;0.187108;0.180262;0.182565 | 702464;702464;702464;702464;702464 | 256;256;128;256;128 | 32768;32768;32768;32768;32768 | |
299 | add_resnetv13_stage4_activation1 | add_relu | [1,2048,7,7] | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 4.00 | 100352 | 401472.00 | 22560.00 | 42.40 | 0.24 | 25.09 | true | 0.425237;0.427908;0.426370;0.418360;0.420741 | 100352;100352;100352;100352;100352 | 401472;401728;401472;401472;401472 | 21696;23360;22208;23360;22112 | |
300 | resnetv13_stage4_conv7_fwd | Convolution | [1,2048,7,7] | 8461.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 130.00 | 134242816 | 4194304.00 | 281237.33 | 3.10 | 29.99 | 1032.64 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 134242816;134242816;134242816;134242816;134242816 | 4194304;4194304;4194304;4194304;4194304 | 282272;279808;281888;279712;282016 | |
300 | resnetv13_stage4_conv7_fwd | Convolution | [1,2048,7,7] | 8461.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 125440 | 2197.33 | 0.00 | 26.20 | 57.09 | 31.36 | false | 0.261604;0.264375;0.263260;0.262062;0.259931 | 125440;125440;125440;125440;125440 | 2272;2048;2272;2048;2272 | 0;0;0;0;0 | |
301 | resnetv13_stage4_batchnorm7_fwd | BatchNorm | [1,512,7,7] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 175616 | 8192.00 | 30122.67 | 14.30 | 4.58 | 40.53 | true | 0.143028;0.140625;0.143762;0.141866;0.143076 | 175616;175616;175616;175616;175616 | 8192;8192;8192;8192;8192 | 29184;30592;30592;33024;28800 | |
302 | resnetv13_stage4_relu4_fwd | Activation | [1,512,7,7] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 50176 | 0.00 | 597.33 | 28.20 | 84.00 | 12.54 | false | 0.282484;0.281925;0.279927;0.281602;0.281193 | 50176;50176;50176;50176;50176 | 896;640;512;640;384 | 256;0;0;0;0 | |
303 | resnetv13_stage4_conv8_fwd | Convolution | [1,512,7,7] | 19366.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 122.00 | 279609344 | 16877856.00 | 3517152.00 | 12.50 | 13.71 | 2291.88 | true | 0.124971;0.124972;0.124972;0.124972;0.124973 | 279609344;279609344;279609344;279609344;279609344 | 3501664;3488736;3562336;3500256;3549536 | 16877856;16877856;16877856;16877856;16877856 | |
303 | resnetv13_stage4_conv8_fwd | Convolution | [1,512,7,7] | 19366.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 34.33 | 15204352 | 9548117.33 | 14133664.00 | 57.90 | 0.64 | 442.85 | true | 0.578167;0.581087;0.579105;0.577049;0.578556 | 15204352;15204352;15204352;15204352;15204352 | 9559936;9576448;9538944;9545472;9534400 | 14154176;14165312;14091712;14151648;14095168 | |
304 | resnetv13_stage4_batchnorm8_fwd | BatchNorm | [1,512,7,7] | 80.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 175616 | 8352.00 | 0.00 | 15.10 | 21.03 | 35.12 | false | 0.155299;0.149003;0.153166;0.152235;0.147139 | 175616;175616;175616;175616;175616 | 8352;8352;8352;8352;8352 | 0;0;0;0;0 | |
305 | resnetv13_stage4_relu5_fwd | Activation | [1,512,7,7] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 50176 | 288.00 | 0.00 | 32.70 | 174.22 | 12.54 | false | 0.326198;0.325706;0.328294;0.325601;0.329002 | 50176;50176;50176;50176;50176 | 0;0;0;0;0 | 288;288;288;288;288 | |
306 | resnetv13_stage4_conv9_fwd | Convolution | [1,512,7,7] | 8182.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 52.00 | 134318080 | 4194304.00 | 0.00 | 4.90 | 32.02 | 2583.04 | false | 0.049149;0.049876;0.049033;0.049488;0.049725 | 134318080;134318080;134318080;134318080;134318080 | 0;0;0;0;0 | 4194304;4194304;4194304;4194304;4194304 | |
306 | resnetv13_stage4_conv9_fwd | Convolution | [1,512,7,7] | 8182.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 501760 | 8192.00 | 0.00 | 59.10 | 61.25 | 100.35 | false | 0.592434;0.588690;0.591046;0.590045;0.591508 | 501760;501760;501760;501760;501760 | 8192;8192;13312;8192;8192 | 0;0;0;0;0 | |
307 | resnetv13_stage4_batchnorm9_fwd | BatchNorm | [1,2048,7,7] | 56 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 702464 | 32768.00 | 0.00 | 18.00 | 21.44 | 95.79 | false | 0.181592;0.189508;0.181067;0.174091;0.178590 | 702464;702464;702464;702464;702464 | 0;0;0;0;0 | 32768;32768;32768;32768;32768 | |
308 | add_resnetv13_stage4_activation2 | add_relu | [1,2048,7,7] | 61.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::AddReluKernel, float*, float*, float*, mxnet::OpReqType>(int, float*, float*, float*, mxnet::OpReqType) | 4.00 | 100352 | 401408.00 | 3648.00 | 42.40 | 0.25 | 25.09 | true | 0.423961;0.418778;0.426982;0.422067;0.428736 | 100352;100352;100352;100352;100352 | 401408;401408;401408;401408;401408 | 3936;3136;3904;2880;3904 | |
309 | resnetv13_pool1_fwd | Pooling | [1,2048,7,7] | 184.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.67 | 144148 | 597.33 | 213.33 | 11.60 | 177.81 | 18.80 | false | 0.116268;0.116366;0.116326;0.116441;0.116318 | 144148;144148;144148;144148;144148 | 1024;512;512;768;512 | 0;640;0;640;0 | |
310 | resnetv13_dense0_fwd | FullyConnected | [1,2048,1,1] | 2597 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 4, 4, false, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>, float, float) | 18.67 | 4431000 | 8193152.00 | 998378.67 | 9.80 | 0.48 | 237.37 | true | 0.098182;0.097981;0.097783;0.098495;0.098287 | 4431000;4431000;4431000;4431000;4431000 | 8211584;8193152;8193152;8193152;8193152 | 997728;998624;997984;998784;998528 | |
310 | resnetv13_dense0_fwd | FullyConnected | [1,2048,1,1] | 2597 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 3.67 | 1000 | 4480.00 | 0.00 | 12.20 | 0.22 | 0.27 | true | 0.122150;0.122171;0.122033;0.122076;0.122041 | 1000;1000;1000;1000;1000 | 4480;4480;4480;4480;4224 | 0;0;0;0;0 |
Showing 1 to 415 of 415 entries