GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | resnetv24_batchnorm0_fwd | BatchNorm | [256,3,224,224] | 12164 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 430.33 | 231604224 | 132465536.00 | 134171221.33 | 94.90 | 0.87 | 538.20 | true | 0.948535;0.954299;0.950202;0.948112;0.949604 | 231604224;231604224;231604224;231604224;231604224 | 151733536;134873632;132465216;130057760;130056736 | 157099456;137445632;132537344;132462944;132530688 | |
0 | resnetv24_batchnorm0_fwd | BatchNorm | [256,3,224,224] | 12164 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 3.67 | 0 | 1504.00 | 5632.00 | 12.10 | 0.00 | 0.00 | true | 0.122309;0.121125;0.120878;0.121125;0.121004 | 0;0;0;0;0 | 1504;1504;1504;1504;1504 | 5632;5632;5632;5632;5632 | |
1 | resnetv24_conv0_fwd | Convolution | [256,3,224,224] | 4121771.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_medium_nn_v1 | 5133.00 | 62889394176 | 11323178.67 | 287966005.33 | 12.70 | 210.13 | 12251.98 | false | 0.126862;0.126152;0.126917;0.126424;0.214912 | 62889394176;62889394176;62889394176;62889394176;62889394176 | 11633632;11160384;11171456;11164448;11901120 | 289812992;286394304;286399424;287685600;294393568 | |
1 | resnetv24_conv0_fwd | Convolution | [256,3,224,224] | 4121771.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 2144.00 | 69546.67 | 7.60 | 0.00 | 0.00 | true | 0.075508;0.075561;0.075676;0.075660;0.075601 | 0;0;0;0;0 | 2144;2144;2144;2144;2144 | 46976;80768;79232;67072;62336 | |
2 | resnetv24_batchnorm1_fwd | BatchNorm | [256,64,112,112] | 199910.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2108.33 | 1241513984 | 143455712.00 | 570948160.00 | 95.40 | 1.74 | 588.86 | true | 0.953965;0.954277;0.953967;0.953782;0.954340 | 1241513984;1241513984;1241513984;1241513984;1241513984 | 561132256;573393888;574321152;569200512;570250080 | 140263712;145728512;141066688;144362016;144938432 | |
3 | resnetv24_relu0_fwd | Activation | [256,64,112,112] | 42842 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2286.00 | 411041792 | 142553173.33 | 564679306.67 | 99.90 | 0.58 | 179.81 | true | 0.998988;0.999023;0.999007;0.998998;0.998988 | 411041792;411041792;411041792;411041792;411041792 | 550800192;563429088;572648704;567496192;563112640 | 138887424;142925824;149043936;142475680;142258016 | |
4 | resnetv24_pool0_fwd | Pooling | [256,64,112,112] | 649398 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 1663.00 | 51380224 | 179586197.33 | 140975904.00 | 72.30 | 0.16 | 30.90 | true | 0.722965;0.722979;0.722903;0.722904;0.722932 | 51380224;51380224;51380224;51380224;51380224 | 175760896;183244000;177694624;180651712;180412256 | 141003776;139967584;141111040;140812896;141604960 | |
5 | resnetv24_stage1_batchnorm0_fwd | BatchNorm | [256,64,56,56] | 50910 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 513.00 | 316669952 | 96342346.67 | 95608896.00 | 87.20 | 1.65 | 617.29 | true | 0.872595;0.872431;0.871924;0.872238;0.872146 | 316669952;316669952;316669952;316669952;316669952 | 89918560;102767328;96341152;89918048;109185952 | 89171616;102005664;95640064;89180960;108298464 | |
6 | resnetv24_stage1_activation0 | Activation | [256,64,56,56] | 10799.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 538.00 | 102760448 | 70648128.00 | 70623712.00 | 98.70 | 0.73 | 191.00 | true | 0.986927;0.987141;0.986684;0.986941;0.987087 | 102760448;102760448;102760448;102760448;102760448 | 64225696;67436960;80281728;80281952;61025440 | 64203776;64203264;83465344;83464096;64198400 | |
7 | resnetv24_stage1_conv0_fwd | Convolution | [256,64,56,56] | 482001 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 586.33 | 6679429120 | 19272170.67 | 19266080.00 | 24.50 | 173.32 | 11391.87 | false | 0.245241;0.245293;0.245428;0.245286;0.245223 | 6679429120;6679429120;6679429120;6679429120;6679429120 | 19271552;19273344;35330688;19271616;16059136 | 19265952;19266272;35322784;19261536;19266016 | |
7 | resnetv24_stage1_conv0_fwd | Convolution | [256,64,56,56] | 482001 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 352.00 | 26325.33 | 6.10 | 0.00 | 0.00 | true | 0.061189;0.061183;0.061227;0.061225;0.061174 | 0;0;0;0;0 | 352;352;7008;352;352 | 26496;26240;40192;26240;26240 | |
8 | resnetv24_stage1_batchnorm1_fwd | BatchNorm | [256,64,56,56] | 14578 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.00 | 316669952 | 90986101.33 | 91038506.67 | 87.80 | 1.74 | 614.89 | true | 0.878031;0.877966;0.878408;0.878697;0.878216 | 316669952;316669952;316669952;316669952;316669952 | 89956736;93176704;96397824;89965440;89973376 | 86704320;93126944;93126944;86704416;93126944 | |
9 | resnetv24_stage1_activation1 | Activation | [256,64,56,56] | 10858.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.00 | 102760448 | 82422762.67 | 81324928.00 | 98.70 | 0.63 | 189.95 | true | 0.987301;0.987126;0.986658;0.987124;0.987856 | 102760448;102760448;102760448;102760448;102760448 | 96301952;70632320;77040512;64208128;118777280 | 96338336;70648224;80281728;64225824;118817184 | |
10 | resnetv24_stage1_conv1_fwd | Convolution | [256,64,56,56] | 3979356.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2962.00 | 31855738880 | 76054357.33 | 143913184.00 | 24.80 | 144.82 | 10754.81 | false | 0.247983;0.248009;0.247981;0.247966;0.247949 | 31855738880;31855738880;31855738880;31855738880;31855738880 | 78344192;75163200;74673824;77090656;75909216 | 144758144;143963872;144327552;142844448;143448128 | |
10 | resnetv24_stage1_conv1_fwd | Convolution | [256,64,56,56] | 3979356.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 7.00 | 237568 | 154048.00 | 262400.00 | 6.20 | 0.57 | 33.94 | true | 0.062449;0.062435;0.062443;0.062438;0.062437 | 237568;237568;237568;237568;237568 | 262528;253312;262144;263552;262528 | 154048;154048;154048;154048;156096 | |
11 | resnetv24_stage1_batchnorm2_fwd | BatchNorm | [256,64,56,56] | 14637 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.00 | 316669952 | 89918282.67 | 88907925.33 | 87.30 | 1.77 | 614.89 | true | 0.872693;0.872336;0.872138;0.872473;0.872760 | 316669952;316669952;316669952;316669952;316669952 | 86706880;109185760;93129184;89918496;86707168 | 88904480;107959584;88908416;88900032;88910880 | |
12 | resnetv24_stage1_activation2 | Activation | [256,64,56,56] | 10975 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 543.00 | 102760448 | 65296021.33 | 66345216.00 | 98.80 | 0.78 | 189.25 | true | 0.987441;0.987162;0.987554;0.987600;0.987858 | 102760448;102760448;102760448;102760448;102760448 | 73860000;67436960;67436672;61014432;57803168 | 70626560;70624640;64204032;64206976;57780608 | |
13 | resnetv24_stage1_conv2_fwd | Convolution | [256,64,56,56] | 1587013 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2294.00 | 26717716480 | 128382421.33 | 535199637.33 | 24.80 | 40.26 | 11646.78 | false | 0.248026;0.248055;0.248047;0.248047;0.248061 | 26717716480;26717716480;26717716480;26717716480;26717716480 | 128010272;129835040;128634240;127790720;128502752 | 539494176;533364192;532740544;526666752;542391520 | |
13 | resnetv24_stage1_conv2_fwd | Convolution | [256,64,56,56] | 1587013 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 29568.00 | 6.10 | 0.00 | 0.00 | true | 0.061193;0.061333;0.061183;0.061195;0.061280 | 0;0;0;0;0 | 96;96;96;96;96 | 26240;44800;36480;25984;25984 | |
14 | resnetv24_stage1_conv3_fwd | Convolution | [256,64,56,56] | 1584391.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2293.67 | 26717716480 | 129233930.67 | 536559786.67 | 24.80 | 40.13 | 11648.47 | false | 0.248056;0.248067;0.248047;0.248041;0.248059 | 26717716480;26717716480;26717716480;26717716480;26717716480 | 128915040;129086112;129714112;129700640;128286464 | 534310304;538861920;534357152;538618752;536703456 | |
14 | resnetv24_stage1_conv3_fwd | Convolution | [256,64,56,56] | 1584391.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 26197.33 | 6.10 | 0.00 | 0.00 | true | 0.061154;0.061170;0.061196;0.061174;0.061152 | 0;0;0;0;0 | 96;96;96;96;96 | 26112;26240;26240;26880;26112 | |
15 | resnetv24_stage1__plus0 | elemwise_add | [256,256,56,56] | 52588.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 2983.00 | 205520896 | 198966944.00 | 496018709.33 | 97.50 | 0.30 | 68.90 | true | 0.975409;0.975348;0.975544;0.975480;0.975605 | 205520896;205520896;205520896;205520896;205520896 | 208040544;199540288;198442304;198918240;197824256 | 506835840;496857984;497718912;493479232;493376960 | |
16 | resnetv24_stage1_batchnorm3_fwd | BatchNorm | [256,256,56,56] | 58001.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2041.33 | 1266679808 | 144946410.67 | 576014197.33 | 87.40 | 1.76 | 620.52 | true | 0.874244;0.874357;0.874652;0.874376;0.874368 | 1266679808;1266679808;1266679808;1266679808;1266679808 | 145449472;145459296;144827360;144562400;143837568 | 576148832;570045344;578273344;576960256;574933504 | |
17 | resnetv24_stage1_activation3 | Activation | [256,256,56,56] | 43206 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2303.33 | 411041792 | 139685173.33 | 559147146.67 | 99.90 | 0.59 | 178.46 | true | 0.998990;0.998995;0.999046;0.998998;0.998963 | 411041792;411041792;411041792;411041792;411041792 | 138468320;139690880;139808896;141908096;139555744 | 554101536;562992384;560347520;563422112;550568896 | |
18 | resnetv24_stage1_conv4_fwd | Convolution | [256,256,56,56] | 1686046.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2062.67 | 26409435136 | 145854549.33 | 142905632.00 | 24.90 | 91.46 | 12803.54 | false | 0.248583;0.248915;0.248894;0.248818;0.248916 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 147729568;146084096;145785888;145633376;145693664 | 143403360;142513824;145697120;142208384;142799712 | |
18 | resnetv24_stage1_conv4_fwd | Convolution | [256,256,56,56] | 1686046.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 25728.00 | 6.10 | 0.00 | 0.00 | true | 0.061176;0.061183;0.061185;0.061215;0.061173 | 0;0;0;0;0 | 96;96;96;352;96 | 25984;26240;25984;24960;25216 | |
19 | resnetv24_stage1_batchnorm4_fwd | BatchNorm | [256,64,56,56] | 14541.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.00 | 316669952 | 90989578.67 | 92043776.00 | 87.30 | 1.73 | 614.89 | true | 0.872868;0.872267;0.872619;0.872684;0.872414 | 316669952;316669952;316669952;316669952;316669952 | 86706528;93129184;89917856;109185504;89921696 | 86726688;96319616;89898368;109178496;89913344 | |
20 | resnetv24_stage1_activation4 | Activation | [256,64,56,56] | 10897 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.00 | 102760448 | 66366538.67 | 64204842.67 | 98.70 | 0.79 | 189.95 | true | 0.986598;0.987290;0.987448;0.987158;0.987003 | 102760448;102760448;102760448;102760448;102760448 | 61014592;64225696;67436960;93127072;67436960 | 64201888;64206208;64204544;89896448;64203776 | |
21 | resnetv24_stage1_conv5_fwd | Convolution | [256,64,56,56] | 3966288.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2956.33 | 31855738880 | 76693482.67 | 143894069.33 | 24.80 | 144.41 | 10775.42 | false | 0.247930;0.247933;0.247946;0.247937;0.247964 | 31855738880;31855738880;31855738880;31855738880;31855738880 | 75678656;76176256;76344672;85310144;77559520 | 142294880;143527008;143561376;148516704;144593824 | |
21 | resnetv24_stage1_conv5_fwd | Convolution | [256,64,56,56] | 3966288.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 237568 | 147648.00 | 259669.33 | 6.20 | 0.58 | 47.51 | true | 0.062360;0.062362;0.062367;0.062362;0.062365 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 253312;263424;262400;253056;263296 | |
22 | resnetv24_stage1_batchnorm5_fwd | BatchNorm | [256,64,56,56] | 14567 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.00 | 316669952 | 94199690.67 | 92085301.33 | 87.20 | 1.70 | 614.89 | true | 0.872361;0.871715;0.872132;0.871933;0.872109 | 316669952;316669952;316669952;316669952;316669952 | 86706208;93129120;93129056;96340896;115608480 | 88913536;92084928;88912128;95257440;114312640 | |
23 | resnetv24_stage1_activation5 | Activation | [256,64,56,56] | 11010.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.67 | 102760448 | 72789120.00 | 70624821.33 | 98.70 | 0.72 | 190.06 | true | 0.987222;0.987570;0.987448;0.987638;0.986761 | 102760448;102760448;102760448;102760448;102760448 | 67437120;61014432;86708640;64225696;86704544 | 64204320;60989568;89889280;64204288;83465856 | |
24 | resnetv24_stage1_conv6_fwd | Convolution | [256,64,56,56] | 1589274 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2292.33 | 26717716480 | 130590784.00 | 541783904.00 | 24.80 | 39.74 | 11655.25 | false | 0.248057;0.248040;0.248021;0.248044;0.248024 | 26717716480;26717716480;26717716480;26717716480;26717716480 | 128665696;133506944;133269248;129837408;128055200 | 537112096;542673792;542052384;540930112;542369216 | |
24 | resnetv24_stage1_conv6_fwd | Convolution | [256,64,56,56] | 1589274 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 24448.00 | 6.10 | 0.00 | 0.00 | true | 0.061176;0.061181;0.061284;0.061164;0.061176 | 0;0;0;0;0 | 21120;26240;25984;26240;20096 | 96;96;96;96;96 | |
25 | resnetv24_stage1__plus1 | elemwise_add | [256,256,56,56] | 52757.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 2982.33 | 205520896 | 199237962.67 | 499031936.00 | 97.60 | 0.29 | 68.91 | true | 0.975431;0.975472;0.975559;0.975531;0.975499 | 205520896;205520896;205520896;205520896;205520896 | 200353952;205128448;198935328;198424608;197072384 | 495076672;501816736;493713792;508346272;500202400 | |
26 | resnetv24_stage1_batchnorm6_fwd | BatchNorm | [256,256,56,56] | 57605 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2041.33 | 1266679808 | 145715840.00 | 579945760.00 | 87.40 | 1.75 | 620.52 | true | 0.874560;0.874485;0.874317;0.874798;0.874452 | 1266679808;1266679808;1266679808;1266679808;1266679808 | 148039328;149416640;144740224;143866400;144367968 | 581545120;581179552;578775008;574020672;579882720 | |
27 | resnetv24_stage1_activation6 | Activation | [256,256,56,56] | 42947 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2271.67 | 411041792 | 141281898.67 | 561686058.67 | 99.90 | 0.58 | 180.94 | true | 0.998987;0.998998;0.998986;0.999005;0.998986 | 411041792;411041792;411041792;411041792;411041792 | 558956640;563661824;567374304;555852000;562439712 | 143436768;143365120;141065376;138820128;139415200 | |
28 | resnetv24_stage1_conv7_fwd | Convolution | [256,256,56,56] | 1682053 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2068.00 | 26409435136 | 147901066.67 | 143844266.67 | 24.90 | 90.52 | 12770.52 | false | 0.248784;0.248955;0.248969;0.248955;0.248980 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 144504800;146284032;144623456;142404544;142376544 | 148207008;156293696;149886304;145609888;144940576 | |
28 | resnetv24_stage1_conv7_fwd | Convolution | [256,256,56,56] | 1682053 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 25984.00 | 6.10 | 0.00 | 0.00 | true | 0.061182;0.061200;0.061197;0.061184;0.061244 | 0;0;0;0;0 | 864;96;96;96;96 | 24960;25984;25984;25984;25984 | |
29 | resnetv24_stage1_batchnorm7_fwd | BatchNorm | [256,64,56,56] | 14534.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.00 | 316669952 | 101691466.67 | 101699040.00 | 87.30 | 1.56 | 614.89 | true | 0.872483;0.872121;0.872664;0.872646;0.872548 | 316669952;316669952;316669952;316669952;316669952 | 96339680;93128416;109184544;99550176;112401056 | 96342176;96286464;109210752;99544192;115610368 | |
30 | resnetv24_stage1_activation7 | Activation | [256,64,56,56] | 10781.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.33 | 102760448 | 63155178.67 | 65274112.00 | 98.70 | 0.80 | 189.48 | true | 0.987453;0.987438;0.987098;0.987557;0.987646 | 102760448;102760448;102760448;102760448;102760448 | 54592064;67436960;61014144;61014432;67436960 | 57791008;67413888;64202880;64205568;67416960 | |
31 | resnetv24_stage1_conv8_fwd | Convolution | [256,64,56,56] | 3975010 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2955.00 | 31855738880 | 76913301.33 | 144317813.33 | 24.80 | 143.99 | 10780.28 | false | 0.247932;0.247947;0.247968;0.247956;0.247961 | 31855738880;31855738880;31855738880;31855738880;31855738880 | 81304064;77392736;75669920;77677248;73820768 | 148001216;145047488;144295840;143610112;143516064 | |
31 | resnetv24_stage1_conv8_fwd | Convolution | [256,64,56,56] | 3975010 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 290645.33 | 6.20 | 0.54 | 50.90 | true | 0.062368;0.062363;0.062360;0.062364;0.062357 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;147648 | 290304;290560;290432;290944;291072 | |
32 | resnetv24_stage1_batchnorm8_fwd | BatchNorm | [256,64,56,56] | 14521.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.00 | 316669952 | 89917024.00 | 88914122.67 | 87.20 | 1.77 | 614.89 | true | 0.872125;0.871540;0.872376;0.872256;0.872063 | 316669952;316669952;316669952;316669952;316669952 | 86705216;93132768;89917024;89917024;89917024 | 88910432;92095040;88912672;88917952;88911744 | |
33 | resnetv24_stage1_activation8 | Activation | [256,64,56,56] | 10754.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.67 | 102760448 | 68509674.67 | 67413973.33 | 98.70 | 0.76 | 190.06 | true | 0.987431;0.986939;0.986712;0.987109;0.987208 | 102760448;102760448;102760448;102760448;102760448 | 54591552;80282016;64232576;73859488;67436960 | 57786400;83456896;64204160;73833216;64204544 | |
34 | resnetv24_stage1_conv9_fwd | Convolution | [256,64,56,56] | 1582824.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2291.00 | 26717716480 | 129491296.00 | 538070592.00 | 24.80 | 40.02 | 11662.03 | false | 0.248048;0.248062;0.248049;0.248043;0.248061 | 26717716480;26717716480;26717716480;26717716480;26717716480 | 544950496;534988096;536974048;537476800;539760928 | 128472640;130394720;129380064;128699104;130536832 | |
34 | resnetv24_stage1_conv9_fwd | Convolution | [256,64,56,56] | 1582824.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 181.33 | 26240.00 | 6.10 | 0.00 | 0.00 | true | 0.061180;0.061182;0.061185;0.061256;0.061194 | 0;0;0;0;0 | 26368;26112;26240;26496;25984 | 96;96;352;352;96 | |
35 | resnetv24_stage1__plus2 | elemwise_add | [256,256,56,56] | 52519.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 2981.33 | 205520896 | 200242976.00 | 498001600.00 | 97.50 | 0.29 | 68.94 | true | 0.975285;0.975412;0.975344;0.975640;0.975486 | 205520896;205520896;205520896;205520896;205520896 | 500124576;502688416;497243040;496388384;496637184 | 197729600;201974048;198383040;200563072;201782816 | |
36 | resnetv24_stage2_batchnorm0_fwd | BatchNorm | [256,256,56,56] | 57874.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2042.00 | 1266679808 | 144742389.33 | 576523125.33 | 87.40 | 1.76 | 620.31 | true | 0.874727;0.874446;0.874656;0.874351;0.874395 | 1266679808;1266679808;1266679808;1266679808;1266679808 | 145747200;144978400;144608192;144057952;144640576 | 579794848;566697536;574405984;576771712;578391680 | |
37 | resnetv24_stage2_activation0 | Activation | [256,256,56,56] | 42801.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2283.33 | 411041792 | 139474250.67 | 556375850.67 | 99.90 | 0.59 | 180.02 | true | 0.999002;0.998999;0.998993;0.998987;0.998992 | 411041792;411041792;411041792;411041792;411041792 | 138693312;138389216;139293728;142163488;140435712 | 557994176;556547360;554586016;561620096;551766944 | |
38 | resnetv24_stage2_conv0_fwd | Convolution | [256,256,56,56] | 3119071.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 4085.67 | 52818870272 | 146208554.67 | 180117493.33 | 17.90 | 161.86 | 12927.85 | false | 0.178843;0.181072;0.176511;0.181847;0.176697 | 52818870272;52818870272;52818870272;52818870272;52818870272 | 146955008;146236640;145434016;149231168;145339072 | 179726208;181016480;179459232;180873216;179753056 | |
38 | resnetv24_stage2_conv0_fwd | Convolution | [256,256,56,56] | 3119071.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 26112.00 | 6.10 | 0.00 | 0.00 | true | 0.061176;0.061174;0.061205;0.061165;0.061179 | 0;0;0;0;0 | 96;96;96;1120;96 | 25984;25984;26112;26240;26240 | |
39 | resnetv24_stage2_batchnorm1_fwd | BatchNorm | [256,128,56,56] | 29423.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1023.00 | 633339904 | 61087872.00 | 122720458.67 | 87.30 | 3.45 | 619.10 | true | 0.873322;0.873616;0.873162;0.873852;0.873308 | 633339904;633339904;633339904;633339904;633339904 | 122262464;117650368;123443200;123612384;122455712 | 60872992;58533312;61445184;61522816;60945440 | |
40 | resnetv24_stage2_activation1 | Activation | [256,128,56,56] | 21649.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1094.33 | 205520896 | 66022485.33 | 132749728.00 | 99.10 | 1.03 | 187.80 | true | 0.991607;0.991549;0.991316;0.991340;0.991479 | 205520896;205520896;205520896;205520896;205520896 | 65924832;64964896;66294848;65847776;66822528 | 132540096;130623072;133276224;132432864;134322784 | |
41 | resnetv24_stage2_conv1_fwd | Convolution | [256,128,56,56] | 3588193 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_small_nn_v1 | 4576.67 | 59241398272 | 33971466.67 | 38359445.33 | 15.10 | 819.03 | 12944.22 | false | 0.151725;0.150950;0.169441;0.151534;0.150994 | 59241398272;59241398272;59241398272;59241398272;59241398272 | 33227712;34372416;33932960;36567616;33609024 | 38116000;38740000;38222336;40266816;37896608 | |
41 | resnetv24_stage2_conv1_fwd | Convolution | [256,128,56,56] | 3588193 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7253.33 | 5.90 | 0.00 | 0.00 | true | 0.059368;0.059379;0.059363;0.059369;0.059357 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7168;7424;6912;7424 | |
42 | resnetv24_stage2_batchnorm2_fwd | BatchNorm | [256,128,28,28] | 7214 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.67 | 162529280 | 102763573.33 | 102761226.67 | 83.50 | 0.79 | 623.51 | true | 0.835275;0.834459;0.834199;0.834828;0.835125 | 162529280;162529280;162529280;162529280;162529280 | 102763744;102763744;102763616;102763104;102763360 | 102773952;102755616;102770720;102757344;102746720 | |
43 | resnetv24_stage2_activation2 | Activation | [256,128,28,28] | 5482.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.33 | 51380224 | 102760800.00 | 102532480.00 | 97.50 | 0.25 | 190.77 | true | 0.974675;0.975728;0.975408;0.975113;0.974895 | 51380224;51380224;51380224;51380224;51380224 | 102543168;102534720;102529024;102533696;102521536 | 102760800;102760800;102760800;102760800;102760800 | |
44 | resnetv24_stage2_conv2_fwd | Convolution | [256,128,28,28] | 1483926.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2113.00 | 26512195584 | 140926570.67 | 279792917.33 | 24.90 | 63.02 | 12547.18 | false | 0.248636;0.248689;0.248665;0.248690;0.248681 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 140059744;150841728;140229408;142490560;139842176 | 276201984;289302816;282552384;280001888;276824480 | |
44 | resnetv24_stage2_conv2_fwd | Convolution | [256,128,28,28] | 1483926.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7381.33 | 5.90 | 0.00 | 0.00 | true | 0.059360;0.059454;0.059401;0.059374;0.059408 | 0;0;0;0;0 | 96;2144;96;96;96 | 6528;11264;7552;7168;7424 | |
45 | resnetv24_stage2_conv3_fwd | Convolution | [256,256,56,56] | 2928076.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_interior_nn_v1 | 4341.33 | 52818870272 | 156026901.33 | 166587061.33 | 16.20 | 163.72 | 12166.51 | false | 0.162128;0.161429;0.162250;0.161787;0.163499 | 52818870272;52818870272;52818870272;52818870272;52818870272 | 154093664;160714368;152429632;158888384;155098656 | 165847040;174621376;165865568;167600352;166295264 | |
45 | resnetv24_stage2_conv3_fwd | Convolution | [256,256,56,56] | 2928076.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 6378.67 | 5.90 | 0.00 | 0.00 | true | 0.059351;0.059379;0.059363;0.059362;0.059368 | 0;0;0;0;0 | 5248;5056;8128;5760;8448 | 96;96;96;96;96 | |
46 | resnetv24_stage2__plus0 | elemwise_add | [256,512,28,28] | 26177.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1476.00 | 102760448 | 202778218.67 | 210210432.00 | 96.50 | 0.25 | 69.62 | true | 0.965281;0.964741;0.964803;0.964907;0.965069 | 102760448;102760448;102760448;102760448;102760448 | 201947552;205552288;206704448;200834816;199935168 | 201128672;209456928;210104000;211070368;211087584 | |
47 | resnetv24_stage2_batchnorm3_fwd | BatchNorm | [256,512,28,28] | 28820 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1037.67 | 650117120 | 62652053.33 | 125741685.33 | 84.70 | 3.45 | 626.52 | true | 0.846729;0.846695;0.846689;0.846586;0.846568 | 650117120;650117120;650117120;650117120;650117120 | 126991360;124072064;118227968;126161632;127163264 | 63277120;61832896;58890656;62846144;63371968 | |
48 | resnetv24_stage2_activation3 | Activation | [256,512,28,28] | 21670.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1098.00 | 205520896 | 65817973.33 | 132302997.33 | 99.20 | 1.04 | 187.18 | true | 0.991732;0.991899;0.991478;0.991514;0.991763 | 205520896;205520896;205520896;205520896;205520896 | 67117408;62723424;65834304;65395840;66223776 | 134934432;126155840;132321280;131469792;133117920 | |
49 | resnetv24_stage2_conv4_fwd | Convolution | [256,512,28,28] | 1563816 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2108.33 | 26358054912 | 142807786.67 | 66226709.33 | 24.90 | 126.09 | 12501.85 | false | 0.248727;0.249200;0.248786;0.248906;0.248774 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 141049120;142819264;142682080;145541472;142922016 | 66146944;66093376;66404000;66351360;66181824 | |
49 | resnetv24_stage2_conv4_fwd | Convolution | [256,512,28,28] | 1563816 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7168.00 | 5.90 | 0.00 | 0.00 | true | 0.059361;0.059685;0.059485;0.059410;0.059397 | 0;0;0;0;0 | 96;96;96;96;96 | 7424;7168;7168;7168;7168 | |
50 | resnetv24_stage2_batchnorm4_fwd | BatchNorm | [256,128,28,28] | 7196.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.33 | 162529280 | 102763317.33 | 102721909.33 | 83.50 | 0.79 | 624.31 | true | 0.834991;0.834516;0.835353;0.835064;0.835068 | 162529280;162529280;162529280;162529280;162529280 | 102763360;102763104;102763232;102763360;102763360 | 102705184;102746848;102687200;102744288;102716256 | |
51 | resnetv24_stage2_activation4 | Activation | [256,128,28,28] | 5445.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.00 | 51380224 | 102760800.00 | 102526741.33 | 97.50 | 0.25 | 190.30 | true | 0.975123;0.975350;0.975354;0.974985;0.974269 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102761056;102760800;102760800;102760800 | 102535936;102517312;102530432;102523008;102526784 | |
52 | resnetv24_stage2_conv5_fwd | Convolution | [256,128,28,28] | 3561917.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1178.00 | 14060093440 | 47425088.00 | 63904330.67 | 17.00 | 126.29 | 11935.56 | false | 0.170995;0.169995;0.168790;0.168439;0.170231 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 46992224;47137024;48146016;46673984;48988544 | 63412224;63535488;64765280;62983712;64893920 | |
52 | resnetv24_stage2_conv5_fwd | Convolution | [256,128,28,28] | 3561917.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 362.00 | 739770368 | 103573962.67 | 142674058.67 | 47.90 | 3.00 | 2043.56 | true | 0.478735;0.478423;0.479051;0.479239;0.477841 | 739770368;739770368;739770368;739770368;739770368 | 103550048;103571168;103560416;103590304;103601056 | 142589536;142721920;142710720;142537696;142729792 | |
52 | resnetv24_stage2_conv5_fwd | Convolution | [256,128,28,28] | 3561917.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 341.00 | 704643072 | 142740800.00 | 110687264.00 | 47.60 | 2.78 | 2066.40 | true | 0.475512;0.477088;0.476688;0.476193;0.475387 | 704643072;704643072;704643072;704643072;704643072 | 142741216;142744416;142730272;142736768;142748384 | 110626304;110645856;110692960;110722976;110801024 | |
52 | resnetv24_stage2_conv5_fwd | Convolution | [256,128,28,28] | 3561917.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 114.67 | 369885184 | 619616.00 | 70792426.67 | 46.70 | 5.18 | 3225.73 | true | 0.467817;0.467083;0.466639;0.465567;0.465844 | 369885184;369885184;369885184;369885184;369885184 | 619936;618528;619488;619552;619808 | 70791744;70779840;70800576;70784960;70834496 | |
53 | resnetv24_stage2_batchnorm5_fwd | BatchNorm | [256,128,28,28] | 8202.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 261.00 | 162529280 | 102764853.33 | 103018666.67 | 83.50 | 0.79 | 622.72 | true | 0.835131;0.835374;0.835724;0.835372;0.834442 | 162529280;162529280;162529280;162529280;162529280 | 102764768;102765408;102764768;102764384;102765024 | 103028160;103005376;103012672;103023648;103019680 | |
54 | resnetv24_stage2_activation5 | Activation | [256,128,28,28] | 5425.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.33 | 51380224 | 102761056.00 | 102530698.67 | 97.50 | 0.25 | 190.77 | true | 0.975140;0.974813;0.973675;0.975158;0.975324 | 51380224;51380224;51380224;51380224;51380224 | 102761056;102761056;102761056;102761056;102761056 | 102534272;102533888;102530528;102521024;102527680 | |
55 | resnetv24_stage2_conv6_fwd | Convolution | [256,128,28,28] | 1495550.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2113.67 | 26512195584 | 142995722.67 | 280501088.00 | 24.90 | 62.60 | 12543.22 | false | 0.248673;0.248683;0.248670;0.248686;0.248676 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 141762784;147514976;140779136;142366624;144857760 | 279531808;285605536;278264128;279539648;282431808 | |
55 | resnetv24_stage2_conv6_fwd | Convolution | [256,128,28,28] | 1495550.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 1888.00 | 8960.00 | 6.00 | 0.00 | 0.00 | true | 0.059755;0.060118;0.060097;0.060087;0.060154 | 0;0;0;0;0 | 1888;1888;1888;1888;1888 | 8960;8960;8960;20224;8960 | |
56 | resnetv24_stage2__plus1 | elemwise_add | [256,512,28,28] | 26235 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1477.67 | 102760448 | 200863306.67 | 208511882.67 | 96.50 | 0.25 | 69.54 | true | 0.964761;0.964908;0.964965;0.964792;0.964921 | 102760448;102760448;102760448;102760448;102760448 | 200854016;200886080;200849824;200376128;204870304 | 210604736;200656032;210821952;211160352;204108960 | |
57 | resnetv24_stage2_batchnorm6_fwd | BatchNorm | [256,512,28,28] | 28913.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.33 | 650117120 | 59786101.33 | 120009760.00 | 84.70 | 3.62 | 627.32 | true | 0.846665;0.846966;0.846959;0.846870;0.846872 | 650117120;650117120;650117120;650117120;650117120 | 43298240;53438656;62892544;63107840;63027104 | 87048448;107308576;126207936;126637888;126512768 | |
58 | resnetv24_stage2_activation6 | Activation | [256,512,28,28] | 21505 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1091.00 | 205520896 | 65545066.67 | 131783424.00 | 99.20 | 1.04 | 188.38 | true | 0.991701;0.991312;0.991842;0.991327;0.991646 | 205520896;205520896;205520896;205520896;205520896 | 61076000;65300896;66239872;65094432;67293024 | 122808640;131300384;133158848;130891040;135270208 | |
59 | resnetv24_stage2_conv7_fwd | Convolution | [256,512,28,28] | 1555205.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2108.33 | 26358054912 | 143548608.00 | 66187712.00 | 24.90 | 125.67 | 12501.85 | false | 0.248822;0.249111;0.248984;0.248656;0.249008 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 144859936;144834176;142600896;143210752;142577216 | 66168640;70321888;66138944;66255552;66060640 | |
59 | resnetv24_stage2_conv7_fwd | Convolution | [256,512,28,28] | 1555205.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 352.00 | 7509.33 | 5.90 | 0.00 | 0.00 | true | 0.059356;0.059364;0.059370;0.059404;0.059372 | 0;0;0;0;0 | 352;352;352;352;4448 | 7424;7680;7424;7424;15616 | |
60 | resnetv24_stage2_batchnorm7_fwd | BatchNorm | [256,128,28,28] | 7202 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.00 | 162529280 | 102763530.67 | 102731242.67 | 83.50 | 0.79 | 625.11 | true | 0.835167;0.835599;0.834444;0.835016;0.835164 | 162529280;162529280;162529280;162529280;162529280 | 102718688;102738656;102736384;102757408;102707552 | 102763616;102763232;102767840;102763744;102762976 | |
61 | resnetv24_stage2_activation7 | Activation | [256,128,28,28] | 5462.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.00 | 51380224 | 102760800.00 | 102536597.33 | 97.50 | 0.25 | 190.30 | true | 0.975385;0.975901;0.975540;0.975074;0.974723 | 51380224;51380224;51380224;51380224;51380224 | 102562304;102540800;102524736;102532352;102536640 | 102767456;102760800;102760800;102760800;102760800 | |
62 | resnetv24_stage2_conv8_fwd | Convolution | [256,128,28,28] | 3554066 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1170.67 | 14060093440 | 47402912.00 | 63821941.33 | 16.80 | 126.41 | 12010.33 | false | 0.166213;0.169484;0.168445;0.165055;0.170582 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 63467840;63255296;57120224;64742688;65812960 | 47109184;47004832;42181632;48094720;48960992 | |
62 | resnetv24_stage2_conv8_fwd | Convolution | [256,128,28,28] | 3554066 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 358.33 | 739770368 | 103549600.00 | 142682613.33 | 47.90 | 3.00 | 2064.48 | true | 0.479372;0.478959;0.479431;0.479465;0.478448 | 739770368;739770368;739770368;739770368;739770368 | 142628544;142761216;142699360;142591584;142719936 | 103558880;103531488;103529632;103558432;103571808 | |
62 | resnetv24_stage2_conv8_fwd | Convolution | [256,128,28,28] | 3554066 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 330.33 | 704643072 | 142697621.33 | 110577162.67 | 47.40 | 2.78 | 2133.13 | true | 0.474605;0.474195;0.473983;0.473883;0.475101 | 704643072;704643072;704643072;704643072;704643072 | 142695616;142693504;142705120;142694976;142702272 | 110574528;110521120;110620960;110536000;110622240 | |
62 | resnetv24_stage2_conv8_fwd | Convolution | [256,128,28,28] | 3554066 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 106.67 | 369885184 | 595594.67 | 70791189.33 | 46.50 | 5.18 | 3467.66 | true | 0.464758;0.464005;0.464468;0.465242;0.466320 | 369885184;369885184;369885184;369885184;369885184 | 70719648;70845632;70782016;70778432;70813120 | 597344;594976;595232;595488;596064 | |
63 | resnetv24_stage2_batchnorm8_fwd | BatchNorm | [256,128,28,28] | 7297.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 261.00 | 162529280 | 102764128.00 | 103014080.00 | 83.50 | 0.79 | 622.72 | true | 0.835531;0.835528;0.835623;0.835440;0.834696 | 162529280;162529280;162529280;162529280;162529280 | 102764000;102764384;102763744;102764256;102764128 | 103014048;103015008;103013184;103018752;103013152 | |
64 | resnetv24_stage2_activation8 | Activation | [256,128,28,28] | 5438 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.33 | 51380224 | 102760800.00 | 102531712.00 | 97.50 | 0.25 | 190.06 | true | 0.975027;0.974629;0.976451;0.974320;0.975871 | 51380224;51380224;51380224;51380224;51380224 | 102528704;102533184;102530624;102531328;102540224 | 102760800;102760800;102760800;102760800;102760800 | |
65 | resnetv24_stage2_conv9_fwd | Convolution | [256,128,28,28] | 1480062.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2112.33 | 26512195584 | 140676256.00 | 279737824.00 | 24.90 | 63.06 | 12551.14 | false | 0.248683;0.248663;0.248675;0.248641;0.248701 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 139689888;140666880;140492768;150139840;140869120 | 275655584;278081216;281890624;286937984;279241632 | |
65 | resnetv24_stage2_conv9_fwd | Convolution | [256,128,28,28] | 1480062.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7253.33 | 5.90 | 0.00 | 0.00 | true | 0.059378;0.059397;0.059463;0.059357;0.059360 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7168;7424;7168;9728 | |
66 | resnetv24_stage2__plus2 | elemwise_add | [256,512,28,28] | 26492 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1476.00 | 102760448 | 200024405.33 | 208416064.00 | 96.50 | 0.25 | 69.62 | true | 0.965031;0.965156;0.965017;0.964814;0.965093 | 102760448;102760448;102760448;102760448;102760448 | 199670752;203798720;200147104;199456192;200255360 | 210325696;209751200;203851360;205332480;210164512 | |
67 | resnetv24_stage2_batchnorm9_fwd | BatchNorm | [256,512,28,28] | 28909 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.33 | 650117120 | 62067082.67 | 124606005.33 | 84.60 | 3.48 | 627.32 | true | 0.846463;0.846373;0.846092;0.846049;0.846274 | 650117120;650117120;650117120;650117120;650117120 | 61428608;62246208;62452768;61502272;62839776 | 123329792;124959904;125371776;123486336;126129984 | |
68 | resnetv24_stage2_activation9 | Activation | [256,512,28,28] | 21810.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1096.67 | 205520896 | 66034773.33 | 132731968.00 | 99.10 | 1.03 | 187.41 | true | 0.991500;0.991728;0.991441;0.991399;0.991541 | 205520896;205520896;205520896;205520896;205520896 | 134505536;131754976;131015520;132948288;133492640 | 66918976;65557120;65165440;66140992;66406208 | |
69 | resnetv24_stage2_conv10_fwd | Convolution | [256,512,28,28] | 1562225.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2107.67 | 26358054912 | 145500725.33 | 67866890.67 | 24.90 | 123.53 | 12505.80 | false | 0.248789;0.249186;0.248813;0.249215;0.248978 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 144398400;149844288;142068160;143364640;148739136 | 66365536;69424928;66085984;71263968;67810208 | |
69 | resnetv24_stage2_conv10_fwd | Convolution | [256,512,28,28] | 1562225.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7168.00 | 5.90 | 0.00 | 0.00 | true | 0.059350;0.059378;0.059375;0.059400;0.059362 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7168;7168;5632;7168 | |
70 | resnetv24_stage2_batchnorm10_fwd | BatchNorm | [256,128,28,28] | 7186.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.00 | 162529280 | 102763189.33 | 102712800.00 | 83.50 | 0.79 | 625.11 | true | 0.834869;0.834688;0.835038;0.835221;0.835274 | 162529280;162529280;162529280;162529280;162529280 | 102762976;102763488;102763104;102763104;102763360 | 102728096;102719648;102710368;102708384;102689920 | |
71 | resnetv24_stage2_activation10 | Activation | [256,128,28,28] | 5457 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.00 | 51380224 | 102760800.00 | 102536554.67 | 97.50 | 0.25 | 190.30 | true | 0.974533;0.975268;0.974903;0.975699;0.975324 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102761056;102760800;102760800 | 102538368;102542848;102528416;102536256;102535040 | |
72 | resnetv24_stage2_conv11_fwd | Convolution | [256,128,28,28] | 3564614.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1171.67 | 14060093440 | 43726794.67 | 58919626.67 | 16.70 | 136.98 | 12000.08 | false | 0.168733;0.169193;0.166530;0.166873;0.166588 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 42002752;38968544;47118048;45879360;43298272 | 56875808;51618848;63532928;61302048;58581024 | |
72 | resnetv24_stage2_conv11_fwd | Convolution | [256,128,28,28] | 3564614.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 359.33 | 739770368 | 103542005.33 | 142623520.00 | 47.90 | 3.01 | 2058.73 | true | 0.479663;0.479236;0.479463;0.478966;0.478669 | 739770368;739770368;739770368;739770368;739770368 | 103543328;103548320;103554272;103531360;103534368 | 142807040;142601088;142591840;142677632;142422240 | |
72 | resnetv24_stage2_conv11_fwd | Convolution | [256,128,28,28] | 3564614.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 332.00 | 704643072 | 142700810.67 | 110554261.33 | 47.40 | 2.78 | 2122.42 | true | 0.474499;0.473108;0.475708;0.474200;0.473908 | 704643072;704643072;704643072;704643072;704643072 | 142685984;142697824;142707232;142701216;142703392 | 110367776;110582656;110528000;110567104;110567680 | |
72 | resnetv24_stage2_conv11_fwd | Convolution | [256,128,28,28] | 3564614.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 107.33 | 369885184 | 595530.67 | 70813802.67 | 46.60 | 5.18 | 3446.15 | true | 0.465029;0.466899;0.466326;0.465993;0.465830 | 369885184;369885184;369885184;369885184;369885184 | 594976;595808;595808;594592;596448 | 70794688;70810432;70813632;70817344;70835008 | |
73 | resnetv24_stage2_batchnorm11_fwd | BatchNorm | [256,128,28,28] | 7266.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 261.00 | 162529280 | 102764426.67 | 103019285.33 | 83.50 | 0.79 | 622.72 | true | 0.834893;0.834488;0.834773;0.834553;0.835340 | 162529280;162529280;162529280;162529280;162529280 | 102764256;102764768;102764512;102764256;102764512 | 103020640;103008256;103015360;103029152;103021856 | |
74 | resnetv24_stage2_activation11 | Activation | [256,128,28,28] | 5455.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.00 | 51380224 | 102760800.00 | 102532245.33 | 97.50 | 0.25 | 190.30 | true | 0.974482;0.974357;0.974964;0.975568;0.975134 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102531584;102538400;102535936;102529216;102528448 | |
75 | resnetv24_stage2_conv12_fwd | Convolution | [256,128,28,28] | 1490271.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2113.67 | 26512195584 | 141202666.67 | 281189066.67 | 24.90 | 62.77 | 12543.22 | false | 0.248695;0.248680;0.248691;0.248697;0.248691 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 139796608;142058240;141352320;140197440;165717600 | 281376704;283623808;278566688;277563104;291026656 | |
75 | resnetv24_stage2_conv12_fwd | Convolution | [256,128,28,28] | 1490271.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7168.00 | 5.90 | 0.00 | 0.00 | true | 0.059373;0.059386;0.059366;0.059366;0.059705 | 0;0;0;0;0 | 96;96;96;96;96 | 7424;7168;7168;7168;7168 | |
76 | resnetv24_stage2__plus3 | elemwise_add | [256,512,28,28] | 26260.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1476.67 | 102760448 | 201028864.00 | 208236010.67 | 96.50 | 0.25 | 69.59 | true | 0.964829;0.964797;0.964935;0.964663;0.964848 | 102760448;102760448;102760448;102760448;102760448 | 199912064;201644000;201206048;203608000;200236544 | 203585408;203400960;210128512;210994112;211212256 | |
77 | resnetv24_stage2_batchnorm12_fwd | BatchNorm | [256,512,28,28] | 28989 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.00 | 650117120 | 61527520.00 | 123504352.00 | 84.70 | 3.51 | 627.53 | true | 0.846614;0.846593;0.846170;0.846493;0.846481 | 650117120;650117120;650117120;650117120;650117120 | 122448768;124679264;123385024;126407392;121948704 | 60992896;62117120;61472544;62971360;60741024 | |
78 | resnetv24_stage2_activation12 | Activation | [256,512,28,28] | 21618.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1096.00 | 205520896 | 66415733.33 | 133512042.67 | 99.10 | 1.03 | 187.52 | true | 0.991341;0.991269;0.991470;0.991680;0.991287 | 205520896;205520896;205520896;205520896;205520896 | 66571040;64062656;66813568;65924640;66751520 | 133834496;128803808;134344384;132528416;134173216 | |
79 | resnetv24_stage2_conv13_fwd | Convolution | [256,512,28,28] | 1558435.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2109.00 | 26358054912 | 144751498.67 | 66301280.00 | 24.90 | 124.89 | 12497.89 | false | 0.248546;0.248330;0.248483;0.249133;0.249149 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 141666976;143435520;146223168;146693856;144595808 | 66139136;66188992;73517408;66391040;66323808 | |
79 | resnetv24_stage2_conv13_fwd | Convolution | [256,512,28,28] | 1558435.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 7253.33 | 5.90 | 0.00 | 0.00 | true | 0.059357;0.059363;0.059358;0.059391;0.059374 | 0;0;0;0;0 | 352;96;96;96;96 | 7040;7168;7424;7168;7424 | |
80 | resnetv24_stage2_batchnorm13_fwd | BatchNorm | [256,128,28,28] | 7217 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.67 | 162529280 | 102762933.33 | 102717866.67 | 83.50 | 0.79 | 623.51 | true | 0.834536;0.835111;0.835305;0.834911;0.835117 | 162529280;162529280;162529280;162529280;162529280 | 102762848;102762976;102762848;102765536;102762976 | 102709984;102716768;102716672;102720160;102753856 | |
81 | resnetv24_stage2_activation13 | Activation | [256,128,28,28] | 5421 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.33 | 51380224 | 102760800.00 | 102536000.00 | 97.50 | 0.25 | 190.06 | true | 0.974610;0.975525;0.974807;0.974815;0.975369 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102534720;102530432;102541248;102545600;102532032 | |
82 | resnetv24_stage2_conv14_fwd | Convolution | [256,128,28,28] | 3557291.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1173.00 | 14060093440 | 46426837.33 | 61542613.33 | 16.80 | 130.22 | 11986.44 | false | 0.166894;0.170289;0.166883;0.167256;0.169848 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 46779232;46988736;45547712;44403648;46953568 | 61988896;62759296;60481792;59478944;62157152 | |
82 | resnetv24_stage2_conv14_fwd | Convolution | [256,128,28,28] | 3557291.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 358.00 | 739770368 | 103533685.33 | 142653813.33 | 47.90 | 3.00 | 2066.40 | true | 0.479046;0.479217;0.478387;0.479561;0.479128 | 739770368;739770368;739770368;739770368;739770368 | 103576096;103512096;103470944;103564320;103524640 | 142641984;142637472;142622176;142681984;142829664 | |
82 | resnetv24_stage2_conv14_fwd | Convolution | [256,128,28,28] | 3557291.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 332.67 | 704643072 | 142696490.67 | 110572458.67 | 47.40 | 2.78 | 2118.16 | true | 0.473785;0.474897;0.474391;0.474879;0.473439 | 704643072;704643072;704643072;704643072;704643072 | 142683232;142708544;142699648;142706592;142679584 | 110462176;110688704;110605856;110585248;110526272 | |
82 | resnetv24_stage2_conv14_fwd | Convolution | [256,128,28,28] | 3557291.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 106.33 | 369885184 | 595360.00 | 70796906.67 | 46.50 | 5.18 | 3478.55 | true | 0.464132;0.464942;0.465385;0.464799;0.466305 | 369885184;369885184;369885184;369885184;369885184 | 595104;596320;594912;595616;595360 | 70824896;70710464;70817088;70754368;70819264 | |
83 | resnetv24_stage2_batchnorm14_fwd | BatchNorm | [256,128,28,28] | 7285 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.67 | 162529280 | 102763616.00 | 103007893.33 | 83.60 | 0.79 | 623.51 | true | 0.835734;0.835546;0.836300;0.834691;0.835846 | 162529280;162529280;162529280;162529280;162529280 | 102763488;102763616;102763872;102763360;102763744 | 103016544;103000704;103001344;103005792;103019744 | |
84 | resnetv24_stage2_activation14 | Activation | [256,128,28,28] | 5422.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.67 | 51380224 | 102760800.00 | 102538965.33 | 97.50 | 0.25 | 190.53 | true | 0.975463;0.975274;0.975162;0.975017;0.975157 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102766688;102760800;102760800;102760800 | 102537344;102551360;102540928;102533440;102538624 | |
85 | resnetv24_stage2_conv15_fwd | Convolution | [256,128,28,28] | 1481055.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2112.00 | 26512195584 | 140624490.67 | 277004917.33 | 24.90 | 63.48 | 12553.12 | false | 0.248686;0.248681;0.248689;0.248700;0.248682 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 277220896;281439968;276981088;276767328;276812768 | 140845728;143363776;140588672;139826432;140439072 | |
85 | resnetv24_stage2_conv15_fwd | Convolution | [256,128,28,28] | 1481055.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7253.33 | 5.90 | 0.00 | 0.00 | true | 0.059365;0.059394;0.059666;0.059366;0.059368 | 0;0;0;0;0 | 96;96;1888;96;96 | 7168;6528;10624;7424;7168 | |
86 | resnetv24_stage2__plus4 | elemwise_add | [256,512,28,28] | 26367.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1475.67 | 102760448 | 200875829.33 | 210792522.67 | 96.50 | 0.25 | 69.64 | true | 0.964962;0.964864;0.965309;0.965271;0.964757 | 102760448;102760448;102760448;102760448;102760448 | 202643712;201509600;199986816;200723264;200394624 | 210772800;209893408;210863584;210741184;210906848 | |
87 | resnetv24_stage2_batchnorm15_fwd | BatchNorm | [256,512,28,28] | 28728.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.67 | 650117120 | 62008981.33 | 124467125.33 | 84.60 | 3.49 | 627.12 | true | 0.846330;0.846318;0.846481;0.846155;0.846122 | 650117120;650117120;650117120;650117120;650117120 | 126961376;123650304;125558688;124192384;122221280 | 63257984;61611168;62554240;61861536;60890176 | |
88 | resnetv24_stage2_activation15 | Activation | [256,512,28,28] | 21628 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1089.67 | 205520896 | 65207424.00 | 131119850.67 | 99.20 | 1.05 | 188.61 | true | 0.991378;0.991722;0.991557;0.991498;0.991534 | 205520896;205520896;205520896;205520896;205520896 | 63484544;64028544;66028576;65886816;65706912 | 127670848;128769664;132761504;132504000;132085888 | |
89 | resnetv24_stage2_conv16_fwd | Convolution | [256,512,28,28] | 1562123 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2108.00 | 26358054912 | 145485696.00 | 66343402.67 | 24.90 | 124.43 | 12503.82 | false | 0.249025;0.248841;0.248858;0.248931;0.248750 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 145009888;145229440;147297952;141697088;146217760 | 66373152;66248544;66909536;66072192;66408512 | |
89 | resnetv24_stage2_conv16_fwd | Convolution | [256,512,28,28] | 1562123 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7168.00 | 5.90 | 0.00 | 0.00 | true | 0.059366;0.059363;0.059356;0.059396;0.059353 | 0;0;0;0;0 | 96;96;96;96;96 | 7424;7168;7168;7168;7168 | |
90 | resnetv24_stage2_batchnorm16_fwd | BatchNorm | [256,128,28,28] | 7202 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.33 | 162529280 | 102763146.67 | 102705173.33 | 83.50 | 0.79 | 624.31 | true | 0.834571;0.834490;0.834507;0.834962;0.834888 | 162529280;162529280;162529280;162529280;162529280 | 102762976;102763360;102763104;102762848;102763616 | 102704704;102696480;102716768;102708704;102702112 | |
91 | resnetv24_stage2_activation16 | Activation | [256,128,28,28] | 5425.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.67 | 51380224 | 102760800.00 | 102529653.33 | 97.50 | 0.25 | 190.53 | true | 0.974953;0.975412;0.974273;0.974655;0.974564 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102761312;102760800 | 102530592;102540384;102528512;102526208;102529856 | |
92 | resnetv24_stage2_conv17_fwd | Convolution | [256,128,28,28] | 3563309 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1171.00 | 14060093440 | 43732576.00 | 58810314.67 | 16.80 | 137.11 | 12006.91 | false | 0.167308;0.167661;0.169505;0.166177;0.168067 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 64152768;55199648;52023968;64597792;57078528 | 48003392;40683008;38216928;48003648;42511328 | |
92 | resnetv24_stage2_conv17_fwd | Convolution | [256,128,28,28] | 3563309 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 358.33 | 739770368 | 103550602.67 | 142677226.67 | 47.90 | 3.00 | 2064.48 | true | 0.477212;0.479512;0.479746;0.478297;0.479525 | 739770368;739770368;739770368;739770368;739770368 | 103571232;103541856;103557600;103552352;103539552 | 142629280;142872544;142584512;142817888;142573920 | |
92 | resnetv24_stage2_conv17_fwd | Convolution | [256,128,28,28] | 3563309 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 332.33 | 704643072 | 142694602.67 | 110560256.00 | 47.50 | 2.78 | 2120.29 | true | 0.476026;0.475193;0.474842;0.474232;0.474545 | 704643072;704643072;704643072;704643072;704643072 | 142703456;142684032;142696256;142721376;142684096 | 110579904;110410144;110580480;110673216;110520384 | |
92 | resnetv24_stage2_conv17_fwd | Convolution | [256,128,28,28] | 3563309 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 107.67 | 369885184 | 595338.67 | 70765290.67 | 46.50 | 5.18 | 3435.46 | true | 0.464858;0.465755;0.465696;0.464866;0.465809 | 369885184;369885184;369885184;369885184;369885184 | 594400;595872;596256;594784;595360 | 70788160;70671808;70755776;70807616;70751936 | |
93 | resnetv24_stage2_batchnorm17_fwd | BatchNorm | [256,128,28,28] | 7262.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 261.00 | 162529280 | 102764042.67 | 103011178.67 | 83.50 | 0.79 | 622.72 | true | 0.835200;0.835449;0.835432;0.834643;0.835412 | 162529280;162529280;162529280;162529280;162529280 | 102764000;102764000;102773600;102764128;102763872 | 103011232;103026176;103012288;102998048;103010016 | |
94 | resnetv24_stage2_activation17 | Activation | [256,128,28,28] | 5445.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.33 | 51380224 | 102760800.00 | 102536757.33 | 97.50 | 0.25 | 190.77 | true | 0.975120;0.975535;0.975521;0.975498;0.975011 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102538528;102528192;102540416;102533888;102537856 | |
95 | resnetv24_stage2_conv18_fwd | Convolution | [256,128,28,28] | 1494036.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2113.33 | 26512195584 | 141528458.67 | 279715178.67 | 24.90 | 62.94 | 12545.20 | false | 0.248680;0.248698;0.248684;0.248665;0.248696 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 142944160;140859520;140781696;143772800;140602496 | 280252832;291211296;277795072;281097632;277372992 | |
95 | resnetv24_stage2_conv18_fwd | Convolution | [256,128,28,28] | 1494036.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7338.67 | 5.90 | 0.00 | 0.00 | true | 0.059385;0.059392;0.059361;0.059376;0.059372 | 0;0;0;0;0 | 96;2400;96;96;96 | 7680;15872;7168;7168;6528 | |
96 | resnetv24_stage2__plus5 | elemwise_add | [256,512,28,28] | 26242.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1477.00 | 102760448 | 202355509.33 | 209671242.67 | 96.50 | 0.25 | 69.57 | true | 0.964718;0.964757;0.964723;0.965055;0.964939 | 102760448;102760448;102760448;102760448;102760448 | 199549696;201825440;200913024;204513472;204328064 | 211190816;201027552;210739872;211254464;207083040 | |
97 | resnetv24_stage2_batchnorm18_fwd | BatchNorm | [256,512,28,28] | 28903.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.33 | 650117120 | 61792245.33 | 124030986.67 | 84.60 | 3.50 | 627.32 | true | 0.846152;0.846196;0.846204;0.846548;0.846505 | 650117120;650117120;650117120;650117120;650117120 | 126466592;120896928;125491968;115736544;125704064 | 62997632;60219808;62522176;57632000;62634752 | |
98 | resnetv24_stage2_activation18 | Activation | [256,512,28,28] | 21668.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1091.67 | 205520896 | 66115232.00 | 132912864.00 | 99.20 | 1.03 | 188.26 | true | 0.991124;0.991724;0.991638;0.991801;0.991841 | 205520896;205520896;205520896;205520896;205520896 | 65863872;65791776;66223360;66258464;66530208 | 132419488;132277376;133110176;133208928;133749440 | |
99 | resnetv24_stage2_conv19_fwd | Convolution | [256,512,28,28] | 1554586.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2110.00 | 26358054912 | 144077109.33 | 66369376.00 | 24.90 | 125.25 | 12491.97 | false | 0.248687;0.248895;0.248603;0.248979;0.249083 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 150906624;145881440;141703840;143787936;142561952 | 69151904;66278656;66116736;66693504;66135968 | |
99 | resnetv24_stage2_conv19_fwd | Convolution | [256,512,28,28] | 1554586.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7253.33 | 5.90 | 0.00 | 0.00 | true | 0.059361;0.059374;0.059391;0.059398;0.059961 | 0;0;0;0;0 | 7168;7424;7168;7424;6912 | 96;96;96;96;96 | |
100 | resnetv24_stage2_batchnorm19_fwd | BatchNorm | [256,128,28,28] | 7205 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.00 | 162529280 | 102762762.67 | 102727957.33 | 83.50 | 0.79 | 625.11 | true | 0.835091;0.835192;0.835252;0.835154;0.834010 | 162529280;162529280;162529280;162529280;162529280 | 102741088;102730912;102705952;102731936;102721024 | 102762720;102762720;102762720;102762848;102765152 | |
101 | resnetv24_stage2_activation19 | Activation | [256,128,28,28] | 5423.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 268.33 | 51380224 | 102760800.00 | 102535264.00 | 97.50 | 0.25 | 191.48 | true | 0.975731;0.975112;0.974773;0.975028;0.975044 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102534080;102531968;102538752;102534080;102537632 | |
102 | resnetv24_stage2_conv20_fwd | Convolution | [256,128,28,28] | 3555773.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1171.33 | 14060093440 | 46596661.33 | 62616298.67 | 16.90 | 128.74 | 12003.50 | false | 0.169146;0.167278;0.169877;0.166613;0.170649 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 42681472;45740544;46090752;48081760;47958688 | 57832032;61640896;62195168;64717792;64012832 | |
102 | resnetv24_stage2_conv20_fwd | Convolution | [256,128,28,28] | 3555773.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 358.33 | 739770368 | 103535200.00 | 142597973.33 | 47.80 | 3.01 | 2064.48 | true | 0.478568;0.479158;0.478753;0.477029;0.478167 | 739770368;739770368;739770368;739770368;739770368 | 103515232;103540128;103523296;103542176;103568480 | 142369664;142658752;142432320;142702848;142745824 | |
102 | resnetv24_stage2_conv20_fwd | Convolution | [256,128,28,28] | 3555773.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 331.67 | 704643072 | 142694752.00 | 110538261.33 | 47.40 | 2.78 | 2124.55 | true | 0.474216;0.474792;0.474317;0.473783;0.474678 | 704643072;704643072;704643072;704643072;704643072 | 142694624;142693152;142696480;142701472;142689344 | 110562656;110389024;110550304;110553472;110511008 | |
102 | resnetv24_stage2_conv20_fwd | Convolution | [256,128,28,28] | 3555773.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 106.33 | 369885184 | 595658.67 | 70804544.00 | 46.60 | 5.18 | 3478.55 | true | 0.466300;0.465622;0.466137;0.465407;0.467312 | 369885184;369885184;369885184;369885184;369885184 | 596832;594976;596384;594528;595616 | 70819008;70848320;70817600;70721856;70777024 | |
103 | resnetv24_stage2_batchnorm20_fwd | BatchNorm | [256,128,28,28] | 7281 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 261.00 | 162529280 | 102763701.33 | 103007349.33 | 83.50 | 0.79 | 622.72 | true | 0.836286;0.834753;0.835305;0.835423;0.835422 | 162529280;162529280;162529280;162529280;162529280 | 102763616;102763872;102763616;102763872;102763360 | 103005440;103011264;102991872;103013152;103005344 | |
104 | resnetv24_stage2_activation20 | Activation | [256,128,28,28] | 5438.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.33 | 51380224 | 102760800.00 | 102536000.00 | 97.50 | 0.25 | 190.77 | true | 0.974694;0.974476;0.975190;0.975075;0.974390 | 51380224;51380224;51380224;51380224;51380224 | 102541312;102529984;102547968;102529600;102536704 | 102760800;102760800;102760800;102760800;102760800 | |
105 | resnetv24_stage2_conv21_fwd | Convolution | [256,128,28,28] | 1480513.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2115.33 | 26512195584 | 141130720.00 | 280061109.33 | 24.90 | 62.95 | 12533.34 | false | 0.248688;0.248687;0.248381;0.248699;0.248695 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 139639008;141194688;146721920;140471712;141725760 | 281644000;279186144;284249888;277616640;279353184 | |
105 | resnetv24_stage2_conv21_fwd | Convolution | [256,128,28,28] | 1480513.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7338.67 | 5.90 | 0.00 | 0.00 | true | 0.059368;0.059392;0.059377;0.059358;0.059369 | 0;0;0;0;0 | 7168;7424;7424;7168;7424 | 96;96;96;96;96 | |
106 | resnetv24_stage2__plus6 | elemwise_add | [256,512,28,28] | 26399.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1475.67 | 102760448 | 203090368.00 | 210106506.67 | 96.50 | 0.25 | 69.64 | true | 0.965001;0.965092;0.964838;0.964905;0.964979 | 102760448;102760448;102760448;102760448;102760448 | 201175136;205716768;209599232;202379200;199814176 | 203408896;209136128;210149888;211033504;211408640 | |
107 | resnetv24_stage2_batchnorm21_fwd | BatchNorm | [256,512,28,28] | 28911.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.67 | 650117120 | 62265301.33 | 125004800.00 | 84.60 | 3.47 | 627.12 | true | 0.846111;0.846032;0.846022;0.846414;0.846229 | 650117120;650117120;650117120;650117120;650117120 | 61494976;62094592;56664864;63449248;63206336 | 123465024;124664000;113787552;127367744;126885376 | |
108 | resnetv24_stage2_activation21 | Activation | [256,512,28,28] | 21480 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1104.67 | 205520896 | 65469248.00 | 131618058.67 | 99.20 | 1.04 | 186.05 | true | 0.991340;0.991546;0.991611;0.991579;0.991610 | 205520896;205520896;205520896;205520896;205520896 | 66516000;65070688;60502976;67311264;64821056 | 133727840;130818112;121692320;135254240;130308224 | |
109 | resnetv24_stage2_conv22_fwd | Convolution | [256,512,28,28] | 1558557.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2108.33 | 26358054912 | 142713493.33 | 66579402.67 | 24.90 | 125.94 | 12501.85 | false | 0.248814;0.248801;0.249120;0.248586;0.249239 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 144064320;142402976;141673184;141371488;149037856 | 66214368;66091904;67326912;66196928;68634080 | |
109 | resnetv24_stage2_conv22_fwd | Convolution | [256,512,28,28] | 1558557.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 7168.00 | 5.90 | 0.00 | 0.00 | true | 0.059770;0.059370;0.059383;0.059398;0.059389 | 0;0;0;0;0 | 7168;7168;7168;7168;7168 | 96;96;96;96;96 | |
110 | resnetv24_stage2_batchnorm22_fwd | BatchNorm | [256,128,28,28] | 7190.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.00 | 162529280 | 102763573.33 | 102729120.00 | 83.50 | 0.79 | 625.11 | true | 0.835103;0.834862;0.835581;0.835525;0.835293 | 162529280;162529280;162529280;162529280;162529280 | 102763744;102763360;102763616;102763744;102763232 | 102719136;102706176;102734848;102744224;102733376 | |
111 | resnetv24_stage2_activation22 | Activation | [256,128,28,28] | 5455 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.67 | 51380224 | 102760800.00 | 102532096.00 | 97.50 | 0.25 | 190.53 | true | 0.975128;0.975835;0.976064;0.974840;0.974962 | 51380224;51380224;51380224;51380224;51380224 | 102537664;102525504;102532032;102527104;102537152 | 102760800;102762592;102760800;102760800;102760800 | |
112 | resnetv24_stage2_conv23_fwd | Convolution | [256,128,28,28] | 3559376.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1171.33 | 14060093440 | 44881290.67 | 59799616.00 | 16.90 | 134.31 | 12003.50 | false | 0.170209;0.169855;0.164995;0.168415;0.170318 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 48109088;39557088;47074976;41943136;45625760 | 64669760;53662752;63417408;55549184;60432256 | |
112 | resnetv24_stage2_conv23_fwd | Convolution | [256,128,28,28] | 3559376.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 357.67 | 739770368 | 103583776.00 | 142685472.00 | 47.90 | 3.00 | 2068.32 | true | 0.479258;0.478637;0.478829;0.478991;0.479113 | 739770368;739770368;739770368;739770368;739770368 | 103549856;103576736;103576544;103598048;103614688 | 142841632;142625152;142818400;142612864;142492256 | |
112 | resnetv24_stage2_conv23_fwd | Convolution | [256,128,28,28] | 3559376.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 330.33 | 704643072 | 142700192.00 | 110546784.00 | 47.50 | 2.78 | 2133.13 | true | 0.476982;0.475116;0.474911;0.475226;0.472804 | 704643072;704643072;704643072;704643072;704643072 | 142725280;142696736;142695008;142702720;142701120 | 110615488;110550240;110475840;110503776;110586336 | |
112 | resnetv24_stage2_conv23_fwd | Convolution | [256,128,28,28] | 3559376.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 107.00 | 369885184 | 595808.00 | 70813589.33 | 46.60 | 5.18 | 3456.87 | true | 0.465688;0.465349;0.469960;0.466102;0.465995 | 369885184;369885184;369885184;369885184;369885184 | 70806592;70819904;70807232;70813632;70849856 | 594784;596256;595232;596000;596192 | |
113 | resnetv24_stage2_batchnorm23_fwd | BatchNorm | [256,128,28,28] | 7274 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 261.00 | 162529280 | 102763914.67 | 103016064.00 | 83.50 | 0.79 | 622.72 | true | 0.835805;0.835377;0.835994;0.835259;0.834864 | 162529280;162529280;162529280;162529280;162529280 | 103018272;103016672;103026784;103013248;102998784 | 102763744;102763232;102764000;102764000;102765664 | |
114 | resnetv24_stage2_activation23 | Activation | [256,128,28,28] | 5435.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.67 | 51380224 | 102760800.00 | 102533888.00 | 97.50 | 0.25 | 190.53 | true | 0.975437;0.975346;0.975680;0.975725;0.974981 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102530496;102530016;102525440;102546368;102541152 | |
115 | resnetv24_stage2_conv24_fwd | Convolution | [256,128,28,28] | 1492590.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2114.33 | 26512195584 | 140898954.67 | 283450773.33 | 24.90 | 62.48 | 12539.27 | false | 0.248656;0.248690;0.248679;0.248690;0.248695 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 140199072;141138464;143087936;141359328;140137280 | 276380128;292034208;279561152;278756960;292160864 | |
115 | resnetv24_stage2_conv24_fwd | Convolution | [256,128,28,28] | 1492590.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 7253.33 | 5.90 | 0.00 | 0.00 | true | 0.059375;0.059398;0.059358;0.059369;0.059375 | 0;0;0;0;0 | 96;96;1120;96;96 | 7424;7168;9472;7168;7168 | |
116 | resnetv24_stage2__plus7 | elemwise_add | [256,512,28,28] | 26243.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1477.67 | 102760448 | 202219669.33 | 206381002.67 | 96.50 | 0.25 | 69.54 | true | 0.964716;0.964760;0.964988;0.964860;0.964842 | 102760448;102760448;102760448;102760448;102760448 | 202869600;204470496;201187040;202602368;199857440 | 204511008;203718848;210913152;211459936;199119008 | |
117 | resnetv24_stage3_batchnorm0_fwd | BatchNorm | [256,512,28,28] | 28915 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.00 | 650117120 | 62432746.67 | 125338912.00 | 84.60 | 3.46 | 627.53 | true | 0.846346;0.846219;0.846502;0.846297;0.846192 | 650117120;650117120;650117120;650117120;650117120 | 62207648;62264672;62825920;63101536;51776704 | 124876480;125000320;126139936;126671616;104016064 | |
118 | resnetv24_stage3_activation0 | Activation | [256,512,28,28] | 21643.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1094.67 | 205520896 | 64542624.00 | 129783392.00 | 99.10 | 1.06 | 187.75 | true | 0.991297;0.991205;0.991376;0.991656;0.991294 | 205520896;205520896;205520896;205520896;205520896 | 62596480;63854336;57134560;67550368;67177056 | 125918656;128391200;114973120;135764320;135040320 | |
119 | resnetv24_stage3_conv0_fwd | Convolution | [256,512,28,28] | 2876998.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 4164.67 | 52716109824 | 142706389.33 | 87057920.00 | 17.60 | 229.44 | 12657.94 | false | 0.176093;0.173871;0.175714;0.175413;0.176029 | 52716109824;52716109824;52716109824;52716109824;52716109824 | 146847392;142915616;141560992;141278912;143642560 | 87253056;87112256;87046432;87015072;86994848 | |
119 | resnetv24_stage3_conv0_fwd | Convolution | [256,512,28,28] | 2876998.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 7253.33 | 5.90 | 0.00 | 0.00 | true | 0.059357;0.059367;0.059368;0.059382;0.059359 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7168;7424;7168;7424 | |
120 | resnetv24_stage3_batchnorm1_fwd | BatchNorm | [256,256,28,28] | 14274 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 518.00 | 325058560 | 94201802.67 | 94126549.33 | 84.10 | 1.73 | 627.53 | true | 0.841026;0.841651;0.841129;0.841599;0.842106 | 325058560;325058560;325058560;325058560;325058560 | 86646592;99490464;99480832;96252224;86601664 | 86706624;99554784;99554592;96343840;86706976 | |
121 | resnetv24_stage3_activation1 | Activation | [256,256,28,28] | 10907.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.33 | 102760448 | 71718549.33 | 71638058.67 | 98.70 | 0.72 | 190.18 | true | 0.987175;0.987766;0.987419;0.987918;0.987313 | 102760448;102760448;102760448;102760448;102760448 | 77070752;67436960;70647936;64225696;93127008 | 76986944;67360896;70566336;64154112;93026080 | |
122 | resnetv24_stage3_conv1_fwd | Convolution | [256,256,28,28] | 3415246.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 4678.00 | 59215708160 | 69098794.67 | 20239242.67 | 14.60 | 662.83 | 12658.34 | false | 0.147190;0.146426;0.145141;0.146505;0.146273 | 59215708160;59215708160;59215708160;59215708160;59215708160 | 69482784;69606336;70967456;68207264;66911904 | 20242528;20266144;20227584;20213216;20247616 | |
122 | resnetv24_stage3_conv1_fwd | Convolution | [256,256,28,28] | 3415246.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057903;0.058072;0.057949;0.058092;0.057899 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2688;2432;2688 | |
123 | resnetv24_stage3_batchnorm2_fwd | BatchNorm | [256,256,14,14] | 6266 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 142.00 | 81264640 | 51976661.33 | 55379221.33 | 88.00 | 0.76 | 572.29 | true | 0.880641;0.882441;0.880391;0.879096;0.877956 | 81264640;81264640;81264640;81264640;81264640 | 51974240;51971200;51974656;51981088;51991040 | 55408544;55389440;55328672;55339680;55599776 | |
124 | resnetv24_stage3_activation2 | Activation | [256,256,14,14] | 2791.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380832.00 | 51372458.67 | 95.80 | 0.25 | 188.44 | true | 0.958229;0.958376;0.958301;0.957625;0.959222 | 25690112;25690112;25690112;25690112;25690112 | 51380832;51380832;51380832;51380832;51380832 | 51374400;51368736;51371168;51377088;51371808 | |
125 | resnetv24_stage3_conv2_fwd | Convolution | [256,256,14,14] | 1414332.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2080.33 | 26409435136 | 156598890.67 | 151953290.67 | 24.90 | 85.59 | 12694.81 | false | 0.249007;0.249031;0.248990;0.249003;0.248832 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 155998944;162739840;154791808;152130976;159005920 | 151187648;153379680;151020320;151778880;152893344 | |
125 | resnetv24_stage3_conv2_fwd | Convolution | [256,256,14,14] | 1414332.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 352.00 | 2773.33 | 5.80 | 0.00 | 0.00 | true | 0.057915;0.058063;0.058356;0.058104;0.057896 | 0;0;0;0;0 | 2688;2688;3072;2688;2944 | 352;352;352;352;352 | |
126 | resnetv24_stage3_conv3_fwd | Convolution | [256,512,28,28] | 2858087.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_interior_nn_v1 | 4265.00 | 52716109824 | 205717888.00 | 93560256.00 | 17.10 | 176.14 | 12360.17 | false | 0.172722;0.173627;0.170111;0.168963;0.169509 | 52716109824;52716109824;52716109824;52716109824;52716109824 | 221171808;203142816;205082272;208928576;201613152 | 98698784;95879552;90694976;92536896;92264320 | |
126 | resnetv24_stage3_conv3_fwd | Convolution | [256,512,28,28] | 2858087.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 1514.67 | 5.80 | 0.00 | 0.00 | true | 0.057863;0.058002;0.057887;0.058048;0.057860 | 0;0;0;0;0 | 704;992;2080;1472;2656 | 96;96;2144;96;96 | |
127 | resnetv24_stage3__plus0 | elemwise_add | [256,1024,14,14] | 13142.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 4117344.00 | 4708725.33 | 95.70 | 5.82 | 69.34 | true | 0.956959;0.957285;0.956956;0.956965;0.957062 | 51380224;51380224;51380224;51380224;51380224 | 6155168;3782304;4188704;6299520;256 | 5561216;3182880;3607936;5748960;128 | |
128 | resnetv24_stage3_batchnorm3_fwd | BatchNorm | [256,1024,14,14] | 25299.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 59372202.67 | 66289141.33 | 90.00 | 2.59 | 586.40 | true | 0.900306;0.900102;0.900246;0.900408;0.900263 | 325058560;325058560;325058560;325058560;325058560 | 48576992;80946528;61528992;48578272;68009344 | 52334720;87218496;66285600;55820224;76761600 | |
129 | resnetv24_stage3_activation3 | Activation | [256,1024,14,14] | 10815.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 547.00 | 102760448 | 76001600.00 | 74926293.33 | 98.70 | 0.68 | 187.86 | true | 0.987065;0.987292;0.987056;0.987482;0.986986 | 102760448;102760448;102760448;102760448;102760448 | 77066880;105968640;64236128;70645440;77066560 | 77074560;105972064;64226976;70648224;80282016 | |
130 | resnetv24_stage3_conv4_fwd | Convolution | [256,1024,14,14] | 1482997.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.00 | 26332364800 | 156798784.00 | 37786058.67 | 24.80 | 135.33 | 12569.15 | false | 0.247861;0.247461;0.248176;0.246959;0.247597 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37804384;37424256;37784608;37785792;37787776 | 155693184;167606592;160204032;153244704;154499136 | |
130 | resnetv24_stage3_conv4_fwd | Convolution | [256,1024,14,14] | 1482997.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2688.00 | 5.80 | 0.00 | 0.00 | true | 0.057905;0.058063;0.057897;0.058065;0.057931 | 0;0;0;0;0 | 96;96;352;96;96 | 2688;2432;2944;2432;3200 | |
131 | resnetv24_stage3_batchnorm4_fwd | BatchNorm | [256,256,14,14] | 6357.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52052128.00 | 55772192.00 | 88.30 | 0.75 | 566.96 | true | 0.881978;0.883011;0.883506;0.882752;0.881625 | 81264640;81264640;81264640;81264640;81264640 | 52056448;52031392;52049184;52050752;52058944 | 55766336;55820320;55767872;55782368;55742016 | |
132 | resnetv24_stage3_activation4 | Activation | [256,256,14,14] | 2779 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51369098.67 | 95.70 | 0.25 | 187.98 | true | 0.957016;0.956975;0.956396;0.956981;0.956305 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51385952 | 51363008;51364864;51372640;51369792;51381792 | |
133 | resnetv24_stage3_conv5_fwd | Convolution | [256,256,14,14] | 3412905.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1502.67 | 19365101568 | 81923093.33 | 77171616.00 | 24.70 | 121.72 | 12887.15 | false | 0.246805;0.246587;0.246504;0.246750;0.246682 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 86201632;79057152;80510496;75622848;88839808 | 77892672;71056096;76758016;77179584;77577248 | |
133 | resnetv24_stage3_conv5_fwd | Convolution | [256,256,14,14] | 3412905.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51361802.67 | 150585344.00 | 47.20 | 1.25 | 911.81 | true | 0.471158;0.471406;0.471936;0.473162;0.471775 | 251658240;251658240;251658240;251658240;251658240 | 51379744;51355040;51346592;51355872;51374496 | 150595424;150614848;150612704;150547904;150544096 | |
133 | resnetv24_stage3_conv5_fwd | Convolution | [256,256,14,14] | 3412905.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 264.00 | 269484032 | 151381013.33 | 54324757.33 | 48.30 | 1.31 | 1020.77 | true | 0.482646;0.482302;0.482750;0.482854;0.482913 | 269484032;269484032;269484032;269484032;269484032 | 151381408;151407776;151376800;151382496;151379136 | 54323328;54358496;54343136;54302624;54307808 | |
133 | resnetv24_stage3_conv5_fwd | Convolution | [256,256,14,14] | 3412905.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2365184.00 | 9313845.33 | 32.50 | 0.45 | 312.26 | true | 0.323203;0.328701;0.327714;0.319316;0.323964 | 5308416;5308416;5308416;5308416;5308416 | 2365184;2365184;2370560;2365184;2365184 | 9313536;9292800;9307776;9320224;9353984 | |
134 | resnetv24_stage3_batchnorm5_fwd | BatchNorm | [256,256,14,14] | 6291.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.33 | 81264640 | 51794506.67 | 55881013.33 | 88.20 | 0.75 | 563.04 | true | 0.881266;0.881863;0.882684;0.881374;0.881823 | 81264640;81264640;81264640;81264640;81264640 | 51806208;51780608;51795904;51809344;51781408 | 55908992;55896256;55859040;55799776;55887744 | |
135 | resnetv24_stage3_activation5 | Activation | [256,256,14,14] | 2777 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51370997.33 | 95.80 | 0.25 | 188.44 | true | 0.958519;0.959422;0.956232;0.956593;0.957903 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380608;51380576 | 51366112;51369760;51370912;51375616;51372320 | |
136 | resnetv24_stage3_conv6_fwd | Convolution | [256,256,14,14] | 1426822.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.33 | 26409435136 | 154938421.33 | 151234976.00 | 24.90 | 86.26 | 12688.71 | false | 0.249003;0.248946;0.248962;0.249029;0.248954 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 156865312;155692256;155012192;154110816;153298080 | 153973344;152053600;151032160;150619168;150503232 | |
136 | resnetv24_stage3_conv6_fwd | Convolution | [256,256,14,14] | 1426822.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 352.00 | 2730.67 | 5.80 | 0.00 | 0.00 | true | 0.057907;0.058085;0.057904;0.058080;0.057926 | 0;0;0;0;0 | 352;352;352;352;352 | 2816;2688;2816;2688;2688 | |
137 | resnetv24_stage3__plus1 | elemwise_add | [256,1024,14,14] | 13194.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 4882549.33 | 5469034.67 | 95.60 | 4.96 | 69.34 | true | 0.956522;0.956375;0.956739;0.956439;0.956456 | 51380224;51380224;51380224;51380224;51380224 | 6720672;5125760;5168064;6113280;704 | 6099040;4528832;4590944;5527872;128 | |
138 | resnetv24_stage3_batchnorm6_fwd | BatchNorm | [256,1024,14,14] | 24770.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 57157888.00 | 63954400.00 | 90.00 | 2.68 | 585.69 | true | 0.900294;0.900177;0.900572;0.900180;0.900476 | 325058560;325058560;325058560;325058560;325058560 | 48529920;61471232;77649344;61471584;48530848 | 55806880;69769152;83723040;66275616;55818432 | |
139 | resnetv24_stage3_activation6 | Activation | [256,1024,14,14] | 10872.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.33 | 102760448 | 71718549.33 | 72782666.67 | 98.70 | 0.71 | 189.83 | true | 0.987215;0.987334;0.987741;0.987396;0.986900 | 102760448;102760448;102760448;102760448;102760448 | 67436960;67436960;77070464;83493280;70648224 | 70639712;67434592;77065664;83494336;70642624 | |
140 | resnetv24_stage3_conv7_fwd | Convolution | [256,1024,14,14] | 1477130.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2093.33 | 26332364800 | 159772672.00 | 37721408.00 | 24.80 | 133.33 | 12579.16 | false | 0.247791;0.247602;0.247322;0.247894;0.247452 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37724384;37745088;37791808;37694752;37672480 | 158886016;156180576;163093920;164163008;157338080 | |
140 | resnetv24_stage3_conv7_fwd | Convolution | [256,1024,14,14] | 1477130.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 3157.33 | 5.80 | 0.00 | 0.00 | true | 0.058066;0.058070;0.058066;0.058078;0.057874 | 0;0;0;0;0 | 96;96;96;2144;96 | 2432;4480;2560;10624;2432 | |
141 | resnetv24_stage3_batchnorm7_fwd | BatchNorm | [256,256,14,14] | 6284 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52040160.00 | 55745450.67 | 88.30 | 0.75 | 568.28 | true | 0.882595;0.881741;0.883316;0.882418;0.883226 | 81264640;81264640;81264640;81264640;81264640 | 55730848;55760992;55739104;55751776;55745472 | 52058816;52034848;52035840;52038144;52046496 | |
142 | resnetv24_stage3_activation7 | Activation | [256,256,14,14] | 2763.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 135.33 | 25690112 | 51380586.67 | 51373685.33 | 95.60 | 0.25 | 189.83 | true | 0.955659;0.956566;0.956973;0.955388;0.958331 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380608;51380576;51380576;51380576 | 51375136;51369664;51377056;51376256;51365152 | |
143 | resnetv24_stage3_conv8_fwd | Convolution | [256,256,14,14] | 3405948.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.67 | 19365101568 | 89723786.67 | 76742762.67 | 24.70 | 116.33 | 12921.55 | false | 0.246669;0.246666;0.246420;0.247140;0.246805 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 94931680;86116576;87976288;89377728;91817344 | 76854176;77284608;75239008;76089504;78835104 | |
143 | resnetv24_stage3_conv8_fwd | Convolution | [256,256,14,14] | 3405948.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51401248.00 | 150586869.33 | 47.10 | 1.25 | 911.81 | true | 0.470767;0.471682;0.470052;0.470094;0.471046 | 251658240;251658240;251658240;251658240;251658240 | 51415008;51368032;51410016;51399776;51393952 | 150566336;150583776;150658656;150543552;150610496 | |
143 | resnetv24_stage3_conv8_fwd | Convolution | [256,256,14,14] | 3405948.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 264.33 | 269484032 | 151321546.67 | 54189696.00 | 48.30 | 1.31 | 1019.49 | true | 0.482833;0.482441;0.482653;0.482484;0.482496 | 269484032;269484032;269484032;269484032;269484032 | 151320928;151321472;151340544;151305664;151322240 | 54189888;54090144;54241952;54193184;54186016 | |
143 | resnetv24_stage3_conv8_fwd | Convolution | [256,256,14,14] | 3405948.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9337152.00 | 30.80 | 0.45 | 318.50 | true | 0.301611;0.308787;0.313302;0.313774;0.301117 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9337632;9366400;9277312;9374496;9307424 | |
144 | resnetv24_stage3_batchnorm8_fwd | BatchNorm | [256,256,14,14] | 6258 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51774474.67 | 55935989.33 | 88.20 | 0.75 | 568.28 | true | 0.882235;0.882884;0.881173;0.882202;0.882356 | 81264640;81264640;81264640;81264640;81264640 | 51760768;51772864;51775296;51784864;51775264 | 55912384;55922208;55949824;55955360;55935936 | |
145 | resnetv24_stage3_activation8 | Activation | [256,256,14,14] | 2772.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51370464.00 | 95.60 | 0.25 | 188.44 | true | 0.955585;0.954977;0.958490;0.957346;0.955537 | 25690112;25690112;25690112;25690112;25690112 | 51371552;51375872;51363744;51366176;51373664 | 51380608;51380576;51380576;51380576;51380576 | |
146 | resnetv24_stage3_conv9_fwd | Convolution | [256,256,14,14] | 1418565 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2084.00 | 26409435136 | 155589568.00 | 151976426.67 | 24.90 | 85.87 | 12672.47 | false | 0.249022;0.248958;0.249003;0.248966;0.248941 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153874816;159087008;155222048;154235200;157311456 | 150661312;153724096;151543872;149977600;154403520 | |
146 | resnetv24_stage3_conv9_fwd | Convolution | [256,256,14,14] | 1418565 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057899;0.058587;0.057942;0.058094;0.057911 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2688;2432;2688 | |
147 | resnetv24_stage3__plus2 | elemwise_add | [256,1024,14,14] | 13213 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 3854954.67 | 4445280.00 | 95.70 | 6.19 | 69.34 | true | 0.956795;0.956963;0.957179;0.957428;0.956848 | 51380224;51380224;51380224;51380224;51380224 | 5764000;128;2459040;3341824;5935168 | 6357664;576;3038784;3939392;6457728 | |
148 | resnetv24_stage3_batchnorm9_fwd | BatchNorm | [256,1024,14,14] | 25152.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 51769397.33 | 58134944.00 | 90.00 | 2.96 | 585.69 | true | 0.900518;0.900078;0.900371;0.900429;0.900259 | 325058560;325058560;325058560;325058560;325058560 | 62782720;83691232;55806048;52318304;55816064 | 58235584;77644512;48533152;48539424;48533184 | |
149 | resnetv24_stage3_activation9 | Activation | [256,1024,14,14] | 10795.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 543.67 | 102760448 | 69577802.67 | 68506912.00 | 98.70 | 0.74 | 189.01 | true | 0.987693;0.987067;0.987433;0.987710;0.987098 | 102760448;102760448;102760448;102760448;102760448 | 61014400;93127008;73859488;64225696;70648224 | 61008128;96334720;70643776;64224384;70652576 | |
150 | resnetv24_stage3_conv10_fwd | Convolution | [256,1024,14,14] | 1483547.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.33 | 26332364800 | 160102666.67 | 37787285.33 | 24.80 | 133.07 | 12573.15 | false | 0.247772;0.247386;0.247209;0.248083;0.247359 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37791904;37757984;37813056;37784960;37784992 | 166088064;157657088;161071328;155459104;161579584 | |
150 | resnetv24_stage3_conv10_fwd | Convolution | [256,1024,14,14] | 1483547.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057921;0.058131;0.057941;0.058065;0.057912 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
151 | resnetv24_stage3_batchnorm10_fwd | BatchNorm | [256,256,14,14] | 6272 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52049504.00 | 55782986.67 | 88.20 | 0.75 | 568.28 | true | 0.882263;0.882597;0.882365;0.882472;0.882972 | 81264640;81264640;81264640;81264640;81264640 | 52036352;52056896;52055264;52032160;52079040 | 55834656;55778624;55761472;55808864;55664576 | |
152 | resnetv24_stage3_activation10 | Activation | [256,256,14,14] | 2753.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51367829.33 | 95.80 | 0.25 | 188.90 | true | 0.956416;0.958296;0.957747;0.958038;0.957126 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51371264;51364448;51370368;51368672;51361120 | |
153 | resnetv24_stage3_conv11_fwd | Convolution | [256,256,14,14] | 3417490.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.00 | 19365101568 | 87671008.00 | 76396938.67 | 24.70 | 118.03 | 12935.94 | false | 0.246640;0.246749;0.246583;0.247042;0.246559 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 87359264;79912128;86655456;89300288;88998304 | 74478304;77500352;77212160;72551936;77581792 | |
153 | resnetv24_stage3_conv11_fwd | Convolution | [256,256,14,14] | 3417490.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51410016.00 | 150577450.67 | 47.10 | 1.25 | 911.81 | true | 0.470809;0.471757;0.471000;0.471208;0.470041 | 251658240;251658240;251658240;251658240;251658240 | 51411488;51416992;51371808;51413792;51404768 | 150537280;150617024;150539968;150593632;150598752 | |
153 | resnetv24_stage3_conv11_fwd | Convolution | [256,256,14,14] | 3417490.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.67 | 269484032 | 151343893.33 | 54265440.00 | 48.20 | 1.31 | 1029.87 | true | 0.482112;0.483169;0.482288;0.482625;0.482199 | 269484032;269484032;269484032;269484032;269484032 | 151353056;151362848;151303040;151336288;151342336 | 54274848;54183968;54392672;54292256;54229216 | |
153 | resnetv24_stage3_conv11_fwd | Convolution | [256,256,14,14] | 3417490.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9348448.00 | 30.50 | 0.45 | 331.78 | true | 0.305915;0.301135;0.304477;0.311030;0.305123 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9345696;9350144;9388064;9349504;9319712 | |
154 | resnetv24_stage3_batchnorm11_fwd | BatchNorm | [256,256,14,14] | 6307.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 51770016.00 | 55912448.00 | 88.20 | 0.75 | 564.34 | true | 0.881715;0.883168;0.882006;0.881520;0.881421 | 81264640;81264640;81264640;81264640;81264640 | 55935232;55909376;55896064;55912384;55915584 | 51786144;51783360;51761184;51762912;51763776 | |
155 | resnetv24_stage3_activation11 | Activation | [256,256,14,14] | 2776.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51370517.33 | 95.70 | 0.25 | 188.90 | true | 0.956214;0.957543;0.958714;0.956547;0.955892 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51365248;51376448;51370880;51370080;51370592 | |
156 | resnetv24_stage3_conv12_fwd | Convolution | [256,256,14,14] | 1427172.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.67 | 26409435136 | 154358528.00 | 152118944.00 | 24.90 | 86.17 | 12680.58 | false | 0.248952;0.249067;0.248956;0.248955;0.248897 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154297248;152671552;157395872;154978816;153799520 | 150521184;150708704;154660160;152763712;152884416 | |
156 | resnetv24_stage3_conv12_fwd | Convolution | [256,256,14,14] | 1427172.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057912;0.058080;0.058398;0.058048;0.057989 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2432;2432;2432 | |
157 | resnetv24_stage3__plus3 | elemwise_add | [256,1024,14,14] | 13224 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 128.00 | 565.33 | 95.60 | 74106.13 | 69.37 | false | 0.956178;0.956353;0.956447;0.956510;0.956466 | 51380224;51380224;51380224;51380224;51380224 | 128;160;128;128;128 | 640;544;512;512;704 | |
158 | resnetv24_stage3_batchnorm12_fwd | BatchNorm | [256,1024,14,14] | 25052.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 55055829.33 | 60483413.33 | 90.00 | 2.81 | 585.69 | true | 0.900383;0.900341;0.900351;0.900319;0.900631 | 325058560;325058560;325058560;325058560;325058560 | 48587328;64766016;48584096;51814144;77724480 | 55816256;69792000;55827552;55830688;83736320 | |
159 | resnetv24_stage3_activation12 | Activation | [256,1024,14,14] | 10829.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.00 | 102760448 | 73859434.67 | 74925312.00 | 98.70 | 0.69 | 189.59 | true | 0.987215;0.986945;0.987524;0.986908;0.987044 | 102760448;102760448;102760448;102760448;102760448 | 77069856;67430144;77064256;70641824;80277888 | 77070880;67436896;77070464;67436960;80281952 | |
160 | resnetv24_stage3_conv13_fwd | Convolution | [256,1024,14,14] | 1478569.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2097.00 | 26332364800 | 156016021.33 | 37731573.33 | 24.70 | 135.91 | 12557.16 | false | 0.246817;0.247186;0.246582;0.248211;0.248074 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 155902144;156873952;154668960;155271968;163142368 | 37738016;37766272;37712704;37734400;37722304 | |
160 | resnetv24_stage3_conv13_fwd | Convolution | [256,1024,14,14] | 1478569.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058363;0.058048;0.057951;0.058089;0.057906 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2688;2432;2688 | |
161 | resnetv24_stage3_batchnorm13_fwd | BatchNorm | [256,256,14,14] | 6323.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52049994.67 | 55746101.33 | 88.20 | 0.75 | 568.28 | true | 0.881559;0.881970;0.883211;0.882244;0.883698 | 81264640;81264640;81264640;81264640;81264640 | 52065952;52058496;52043808;52041632;52047680 | 55690176;55734080;55739744;55764480;55793696 | |
162 | resnetv24_stage3_activation13 | Activation | [256,256,14,14] | 2761.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51372938.67 | 95.70 | 0.25 | 188.44 | true | 0.956547;0.955922;0.957516;0.956613;0.956845 | 25690112;25690112;25690112;25690112;25690112 | 51370496;51373088;51376032;51364960;51375232 | 51380576;51380576;51380576;51380576;51380576 | |
163 | resnetv24_stage3_conv14_fwd | Convolution | [256,256,14,14] | 3415233.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.33 | 19365101568 | 82788906.67 | 77360042.67 | 24.70 | 120.92 | 12924.43 | false | 0.246685;0.246927;0.246883;0.246799;0.246959 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 77830848;77109920;77195968;77260288;77623872 | 81175200;83659552;84482400;83246240;81460928 | |
163 | resnetv24_stage3_conv14_fwd | Convolution | [256,256,14,14] | 3415233.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51401141.33 | 150593141.33 | 47.10 | 1.25 | 911.81 | true | 0.470681;0.469945;0.471387;0.470549;0.471172 | 251658240;251658240;251658240;251658240;251658240 | 150587328;150583776;150608320;150575296;150629728 | 51421408;51388000;51416928;51372576;51398496 | |
163 | resnetv24_stage3_conv14_fwd | Convolution | [256,256,14,14] | 3415233.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.67 | 269484032 | 151329504.00 | 54170261.33 | 48.20 | 1.31 | 1022.06 | true | 0.482479;0.481834;0.482583;0.482610;0.482322 | 269484032;269484032;269484032;269484032;269484032 | 151313152;151343136;151351328;151318720;151326656 | 54171008;54235744;54198432;54141344;54070368 | |
163 | resnetv24_stage3_conv14_fwd | Convolution | [256,256,14,14] | 3415233.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9356725.33 | 30.50 | 0.45 | 325.01 | true | 0.305681;0.303438;0.305190;0.309503;0.303247 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9340928;9366144;9363104;9387264;9326912 | |
164 | resnetv24_stage3_batchnorm14_fwd | BatchNorm | [256,256,14,14] | 6281.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 51780864.00 | 55927082.67 | 88.20 | 0.75 | 566.96 | true | 0.881537;0.882467;0.881667;0.881296;0.881424 | 81264640;81264640;81264640;81264640;81264640 | 51778656;51789088;51792800;51774848;51765824 | 55939168;55921984;55920096;55913184;55944544 | |
165 | resnetv24_stage3_activation14 | Activation | [256,256,14,14] | 2751.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51371434.67 | 95.70 | 0.25 | 188.90 | true | 0.957024;0.956213;0.957457;0.957152;0.955741 | 25690112;25690112;25690112;25690112;25690112 | 51370752;51376288;51372864;51370688;51363808 | 51380576;51380576;51380576;51380576;51380576 | |
166 | resnetv24_stage3_conv15_fwd | Convolution | [256,256,14,14] | 1416228.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.67 | 26409435136 | 158454538.67 | 153179701.33 | 24.90 | 84.74 | 12680.58 | false | 0.248951;0.249023;0.249069;0.248916;0.248859 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 159936800;162015424;154702464;160724352;153019232 | 153946528;154512416;151214336;154378240;149876960 | |
166 | resnetv24_stage3_conv15_fwd | Convolution | [256,256,14,14] | 1416228.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2944.00 | 5.80 | 0.00 | 0.00 | true | 0.057878;0.058068;0.057917;0.058109;0.057896 | 0;0;0;0;0 | 96;96;96;96;96 | 3456;2688;2432;2688;7808 | |
167 | resnetv24_stage3__plus4 | elemwise_add | [256,1024,14,14] | 13251.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 331157.33 | 515765.33 | 95.70 | 60.67 | 69.34 | false | 0.956826;0.957136;0.956967;0.957143;0.957180 | 51380224;51380224;51380224;51380224;51380224 | 5222560;128;128;993184;160 | 5797472;992;1696;1544608;800 | |
168 | resnetv24_stage3_batchnorm15_fwd | BatchNorm | [256,1024,14,14] | 25122.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 56087061.33 | 60454698.67 | 90.10 | 2.79 | 585.69 | true | 0.900557;0.900906;0.900376;0.900456;0.900749 | 325058560;325058560;325058560;325058560;325058560 | 48523712;61481728;58237408;71191680;48542048 | 52304480;66275456;62774496;76733280;52314144 | |
169 | resnetv24_stage3_activation15 | Activation | [256,1024,14,14] | 10788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.67 | 102760448 | 69578538.67 | 68503402.67 | 98.70 | 0.74 | 189.36 | true | 0.987184;0.986780;0.987518;0.987093;0.987103 | 102760448;102760448;102760448;102760448;102760448 | 73859200;64228256;61014144;73859488;70648160 | 70642784;64222944;61011456;70644480;70645248 | |
170 | resnetv24_stage3_conv16_fwd | Convolution | [256,1024,14,14] | 1481429.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.33 | 26332364800 | 159244821.33 | 37771584.00 | 24.70 | 133.66 | 12573.15 | false | 0.247317;0.246880;0.247232;0.247302;0.247858 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 158692800;159109984;160055904;157610816;159931680 | 37751648;37769120;37753248;37856512;37792384 | |
170 | resnetv24_stage3_conv16_fwd | Convolution | [256,1024,14,14] | 1481429.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 3114.67 | 5.80 | 0.00 | 0.00 | true | 0.057912;0.058105;0.057913;0.058048;0.057947 | 0;0;0;0;0 | 4480;2432;2432;2432;7552 | 1376;96;96;96;96 | |
171 | resnetv24_stage3_batchnorm16_fwd | BatchNorm | [256,256,14,14] | 6258.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52042944.00 | 55789098.67 | 88.30 | 0.75 | 568.28 | true | 0.882469;0.882571;0.882845;0.882927;0.883189 | 81264640;81264640;81264640;81264640;81264640 | 52035392;52041280;52040352;52047200;52058176 | 55764224;55794464;55808608;55845376;55693952 | |
172 | resnetv24_stage3_activation16 | Activation | [256,256,14,14] | 2769.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51373365.33 | 95.80 | 0.25 | 188.44 | true | 0.956764;0.958996;0.958659;0.957788;0.957438 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51394784;51380576;51380576;51380576 | 51368640;51371936;51374080;51374080;51377408 | |
173 | resnetv24_stage3_conv17_fwd | Convolution | [256,256,14,14] | 3410553.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1511.67 | 19365101568 | 87859477.33 | 77903381.33 | 24.70 | 116.82 | 12810.43 | false | 0.246683;0.247024;0.246895;0.247087;0.247015 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 82840768;87287264;88977152;88282976;88008192 | 74612640;78311456;78282720;77115968;78996640 | |
173 | resnetv24_stage3_conv17_fwd | Convolution | [256,256,14,14] | 3410553.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51391456.00 | 150574240.00 | 47.10 | 1.25 | 911.81 | true | 0.470411;0.470978;0.471386;0.470930;0.471530 | 251658240;251658240;251658240;251658240;251658240 | 51390240;51382368;51401760;51381344;51414816 | 150580704;150543936;150584768;150557248;150616672 | |
173 | resnetv24_stage3_conv17_fwd | Convolution | [256,256,14,14] | 3410553.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.67 | 269484032 | 151341504.00 | 54254528.00 | 48.20 | 1.31 | 1033.82 | true | 0.481828;0.482272;0.482071;0.482241;0.481794 | 269484032;269484032;269484032;269484032;269484032 | 151343840;151337600;151344448;151334752;151343072 | 54233888;54122272;54256032;54300992;54273664 | |
173 | resnetv24_stage3_conv17_fwd | Convolution | [256,256,14,14] | 3410553.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9336725.33 | 30.20 | 0.45 | 331.78 | true | 0.300594;0.304900;0.300906;0.302177;0.302075 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9329408;9392384;9308832;9371936;9285920 | |
174 | resnetv24_stage3_batchnorm17_fwd | BatchNorm | [256,256,14,14] | 6313.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 51779114.67 | 55910069.33 | 88.20 | 0.75 | 564.34 | true | 0.882013;0.882185;0.881970;0.880395;0.882048 | 81264640;81264640;81264640;81264640;81264640 | 51789760;51792640;51768320;51774400;51773184 | 55937440;55906048;55910752;55913408;55902624 | |
175 | resnetv24_stage3_activation17 | Activation | [256,256,14,14] | 2751.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51369984.00 | 95.70 | 0.25 | 188.44 | true | 0.956445;0.957356;0.956239;0.955697;0.958466 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380608;51380576;51380576 | 51365792;51362400;51373280;51371552;51372608 | |
176 | resnetv24_stage3_conv18_fwd | Convolution | [256,256,14,14] | 1425938.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.00 | 26409435136 | 155968960.00 | 152305141.33 | 24.90 | 85.67 | 12690.74 | false | 0.248997;0.249037;0.248959;0.249059;0.248586 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 151689440;152100704;153125280;149957088;155812416 | 156419232;156877440;154610208;152269664;161531968 | |
176 | resnetv24_stage3_conv18_fwd | Convolution | [256,256,14,14] | 1425938.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057952;0.058109;0.057883;0.058063;0.057926 | 0;0;0;0;0 | 2432;2432;2176;2432;2432 | 96;96;96;96;96 | |
177 | resnetv24_stage3__plus5 | elemwise_add | [256,1024,14,14] | 13169 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.33 | 51380224 | 4101322.67 | 4699605.33 | 95.60 | 5.84 | 69.40 | true | 0.956627;0.956277;0.956673;0.956426;0.956022 | 51380224;51380224;51380224;51380224;51380224 | 6717504;2256224;2518208;5890304;5690304 | 6118080;1649888;1884544;5299744;5119680 | |
178 | resnetv24_stage3_batchnorm18_fwd | BatchNorm | [256,1024,14,14] | 24734.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 553.67 | 325058560 | 60449525.33 | 62810752.00 | 90.00 | 2.64 | 587.10 | true | 0.900195;0.900461;0.900765;0.900456;0.900554 | 325058560;325058560;325058560;325058560;325058560 | 58293344;51810048;55052608;80970208;68002624 | 62809600;55838784;55828704;87235104;69783872 | |
179 | resnetv24_stage3_activation18 | Activation | [256,1024,14,14] | 10786.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 544.00 | 102760448 | 69577941.33 | 70645472.00 | 98.70 | 0.73 | 188.90 | true | 0.986922;0.987055;0.987608;0.987197;0.987343 | 102760448;102760448;102760448;102760448;102760448 | 89915808;80281952;61014144;61014432;67437440 | 89911456;80276736;64225216;61011616;67434464 | |
180 | resnetv24_stage3_conv19_fwd | Convolution | [256,1024,14,14] | 1477194.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.67 | 26332364800 | 157884192.00 | 37718965.33 | 24.80 | 134.62 | 12559.15 | false | 0.247162;0.248291;0.248332;0.247779;0.247567 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 167834080;158906656;156972320;157773600;155994368 | 33690304;37705568;37845056;37772384;37678944 | |
180 | resnetv24_stage3_conv19_fwd | Convolution | [256,1024,14,14] | 1477194.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057910;0.058049;0.058031;0.058085;0.057899 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;2688;2432 | |
181 | resnetv24_stage3_batchnorm19_fwd | BatchNorm | [256,256,14,14] | 6264.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52045589.33 | 55742282.67 | 88.30 | 0.75 | 568.28 | true | 0.883052;0.883095;0.882067;0.881077;0.883068 | 81264640;81264640;81264640;81264640;81264640 | 52057600;52032800;52024768;52046880;52057088 | 55675040;55774048;55826272;55773632;55679168 | |
182 | resnetv24_stage3_activation19 | Activation | [256,256,14,14] | 2745 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380586.67 | 51366496.00 | 95.70 | 0.25 | 188.90 | true | 0.957201;0.956569;0.956563;0.956087;0.957171 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380608;51382368 | 51369824;51368768;51365280;51365440;51364832 | |
183 | resnetv24_stage3_conv20_fwd | Convolution | [256,256,14,14] | 3405101.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.67 | 19365101568 | 86837408.00 | 77994805.33 | 24.70 | 117.48 | 12921.55 | false | 0.247132;0.246666;0.246682;0.246864;0.246668 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 86118784;87158528;87234912;78632064;96785792 | 75537888;78324256;77567200;78092960;78646688 | |
183 | resnetv24_stage3_conv20_fwd | Convolution | [256,256,14,14] | 3405101.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.00 | 251658240 | 51399968.00 | 150583914.67 | 47.10 | 1.25 | 915.12 | true | 0.470807;0.471166;0.471279;0.470976;0.471128 | 251658240;251658240;251658240;251658240;251658240 | 51397600;51412832;51388000;51389472;51420960 | 150586176;150562752;150602816;150686944;150559840 | |
183 | resnetv24_stage3_conv20_fwd | Convolution | [256,256,14,14] | 3405101.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 262.67 | 269484032 | 151335445.33 | 54246570.67 | 48.20 | 1.31 | 1025.95 | true | 0.482560;0.482283;0.482179;0.482024;0.482698 | 269484032;269484032;269484032;269484032;269484032 | 151350208;151326912;151352640;151269408;151329216 | 54309248;54074528;54320576;54164160;54266304 | |
183 | resnetv24_stage3_conv20_fwd | Convolution | [256,256,14,14] | 3405101.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9331701.33 | 30.90 | 0.45 | 325.01 | true | 0.310187;0.307667;0.309969;0.309052;0.303644 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9310368;9380736;9340960;9217792;9343776 | |
184 | resnetv24_stage3_batchnorm20_fwd | BatchNorm | [256,256,14,14] | 6274 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51769216.00 | 55918890.67 | 88.20 | 0.75 | 568.28 | true | 0.880955;0.882826;0.881505;0.881272;0.881741 | 81264640;81264640;81264640;81264640;81264640 | 51780992;51766944;51756224;51767488;51773216 | 55883648;55914112;55943392;55940000;55902560 | |
185 | resnetv24_stage3_activation20 | Activation | [256,256,14,14] | 2714 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51370592.00 | 95.70 | 0.25 | 188.90 | true | 0.956649;0.957387;0.956635;0.956820;0.956858 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380576;51380576;51380576 | 51373984;51382432;51365440;51369216;51368576 | |
186 | resnetv24_stage3_conv21_fwd | Convolution | [256,256,14,14] | 1417471.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2079.67 | 26409435136 | 155892576.00 | 153875957.33 | 24.90 | 85.26 | 12698.88 | false | 0.249012;0.248999;0.248889;0.249029;0.248868 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 165643296;155529792;155247584;154266592;156900352 | 155299200;153842016;150828992;153207648;154578208 | |
186 | resnetv24_stage3_conv21_fwd | Convolution | [256,256,14,14] | 1417471.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.057874;0.058072;0.057896;0.058095;0.057894 | 0;0;0;0;0 | 2688;2432;2816;2432;2688 | 96;96;96;96;96 | |
187 | resnetv24_stage3__plus6 | elemwise_add | [256,1024,14,14] | 13225.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 3398176.00 | 3977589.33 | 95.70 | 6.97 | 69.37 | true | 0.957072;0.957134;0.956978;0.957071;0.957063 | 51380224;51380224;51380224;51380224;51380224 | 1049600;3801280;1393184;5814272;5000064 | 1611136;4394080;1961216;6340192;5577472 | |
188 | resnetv24_stage3_batchnorm21_fwd | BatchNorm | [256,1024,14,14] | 25179.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 57210325.33 | 60478432.00 | 90.20 | 2.76 | 586.40 | true | 0.902183;0.901735;0.901938;0.901994;0.902382 | 325058560;325058560;325058560;325058560;325058560 | 48581312;61523744;68006336;48579424;61525920 | 52334368;62796416;69775200;55831616;62807264 | |
189 | resnetv24_stage3_activation21 | Activation | [256,1024,14,14] | 11041 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.00 | 102760448 | 67438101.33 | 68505738.67 | 98.70 | 0.76 | 190.30 | true | 0.987100;0.987375;0.987315;0.987799;0.987744 | 102760448;102760448;102760448;102760448;102760448 | 77065088;64228992;89915168;57801184;64223136 | 77070464;64229472;89915808;57803680;61014368 | |
190 | resnetv24_stage3_conv22_fwd | Convolution | [256,1024,14,14] | 1482188.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2097.00 | 26332364800 | 156241258.67 | 37740469.33 | 24.80 | 135.75 | 12557.16 | false | 0.247545;0.248388;0.246747;0.247756;0.247236 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37715040;37751360;37701728;37760896;37755008 | 156727040;154796032;163019456;154031520;157200704 | |
190 | resnetv24_stage3_conv22_fwd | Convolution | [256,1024,14,14] | 1482188.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057896;0.058501;0.057883;0.058061;0.057933 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
191 | resnetv24_stage3_batchnorm22_fwd | BatchNorm | [256,256,14,14] | 6337.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52033877.33 | 55784789.33 | 88.30 | 0.75 | 566.96 | true | 0.883146;0.882284;0.882715;0.883061;0.882460 | 81264640;81264640;81264640;81264640;81264640 | 52049152;52029120;52031968;52030400;52039264 | 55744928;55848064;55792000;55787456;55774912 | |
192 | resnetv24_stage3_activation22 | Activation | [256,256,14,14] | 2838.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51370869.33 | 95.80 | 0.25 | 188.44 | true | 0.959306;0.956971;0.958438;0.956546;0.957561 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51369024;51368512;51372800;51370784;51375712 | |
193 | resnetv24_stage3_conv23_fwd | Convolution | [256,256,14,14] | 3414353.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 84587893.33 | 77738677.33 | 24.70 | 119.30 | 12930.18 | false | 0.246773;0.246770;0.246744;0.246844;0.246488 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 80259936;83267552;88778304;82241600;88254528 | 77476672;79077760;77774464;77696928;77744640 | |
193 | resnetv24_stage3_conv23_fwd | Convolution | [256,256,14,14] | 3414353.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.33 | 251658240 | 51391306.67 | 150616736.00 | 47.10 | 1.25 | 914.01 | true | 0.470780;0.471296;0.471581;0.470869;0.469949 | 251658240;251658240;251658240;251658240;251658240 | 150639072;150584128;150636352;150589792;150624064 | 51382304;51390752;51386720;51396448;51424864 | |
193 | resnetv24_stage3_conv23_fwd | Convolution | [256,256,14,14] | 3414353.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.67 | 269484032 | 151315712.00 | 54220160.00 | 48.20 | 1.31 | 1029.87 | true | 0.482780;0.481631;0.481986;0.482086;0.482208 | 269484032;269484032;269484032;269484032;269484032 | 54226336;54215520;54227392;54218624;54194272 | 151268032;151350336;151333376;151267808;151345728 | |
193 | resnetv24_stage3_conv23_fwd | Convolution | [256,256,14,14] | 3414353.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9297674.67 | 30.30 | 0.46 | 325.01 | true | 0.300593;0.301837;0.305317;0.307685;0.303078 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9273856;9320320;9294848;9324704;9277856 | |
194 | resnetv24_stage3_batchnorm23_fwd | BatchNorm | [256,256,14,14] | 6316 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.67 | 81264640 | 51773376.00 | 55897173.33 | 88.20 | 0.75 | 561.74 | true | 0.881636;0.882298;0.882285;0.881310;0.882090 | 81264640;81264640;81264640;81264640;81264640 | 51761024;51761888;51775008;51783232;51784992 | 55879968;55943552;55898432;55913120;55848608 | |
195 | resnetv24_stage3_activation23 | Activation | [256,256,14,14] | 2757.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380928.00 | 51372320.00 | 95.70 | 0.25 | 188.90 | true | 0.957349;0.957253;0.957470;0.956311;0.956163 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51382624;51381600;51380576 | 51367840;51364192;51380224;51368896;51383808 | |
196 | resnetv24_stage3_conv24_fwd | Convolution | [256,256,14,14] | 1426878.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2084.67 | 26409435136 | 157298720.00 | 152737120.00 | 24.90 | 85.18 | 12668.42 | false | 0.248988;0.248919;0.248934;0.249064;0.248875 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 159404544;155364928;157243072;154906048;159288160 | 154403456;151258400;153240288;151249568;153712672 | |
196 | resnetv24_stage3_conv24_fwd | Convolution | [256,256,14,14] | 1426878.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057907;0.058077;0.058466;0.058087;0.057924 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2304 | |
197 | resnetv24_stage3__plus7 | elemwise_add | [256,1024,14,14] | 13183 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 2819776.00 | 3411200.00 | 95.60 | 8.25 | 69.43 | true | 0.956560;0.956299;0.956550;0.956285;0.956348 | 51380224;51380224;51380224;51380224;51380224 | 3757664;2519488;4216608;2182176;5248 | 4385152;3065152;4781088;2783296;1536 | |
198 | resnetv24_stage3_batchnorm24_fwd | BatchNorm | [256,1024,14,14] | 24956.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 56129610.67 | 60488458.67 | 90.20 | 2.79 | 586.40 | true | 0.902071;0.901961;0.902083;0.902193;0.902154 | 325058560;325058560;325058560;325058560;325058560 | 58288864;58284544;51815424;51810496;74474624 | 62818496;62803040;55843840;55838368;76755136 | |
199 | resnetv24_stage3_activation24 | Activation | [256,1024,14,14] | 10780.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.33 | 102760448 | 67437429.33 | 69576405.33 | 98.70 | 0.75 | 189.83 | true | 0.987574;0.987522;0.987141;0.986642;0.987042 | 102760448;102760448;102760448;102760448;102760448 | 61014144;80281952;73861248;67436896;57803104 | 64221120;80279200;73864704;70643392;57799808 | |
200 | resnetv24_stage3_conv25_fwd | Convolution | [256,1024,14,14] | 1477401 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 156944917.33 | 37782752.00 | 24.80 | 135.23 | 12565.15 | false | 0.247499;0.248067;0.247423;0.247104;0.248148 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37743456;37135712;37881824;37872480;37732320 | 152882496;157941632;156168960;158915808;156724160 | |
200 | resnetv24_stage3_conv25_fwd | Convolution | [256,1024,14,14] | 1477401 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2730.67 | 5.80 | 0.00 | 0.00 | true | 0.057902;0.058037;0.057971;0.058094;0.057904 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2944;2816;2432;2688 | |
201 | resnetv24_stage3_batchnorm25_fwd | BatchNorm | [256,256,14,14] | 6268 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52048906.67 | 55754848.00 | 88.20 | 0.75 | 565.65 | true | 0.882601;0.881835;0.881839;0.882012;0.882554 | 81264640;81264640;81264640;81264640;81264640 | 52055584;52031392;52055520;52038176;52053024 | 55704000;55813504;55727872;55762112;55774560 | |
202 | resnetv24_stage3_activation25 | Activation | [256,256,14,14] | 2725 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51373386.67 | 95.70 | 0.25 | 187.98 | true | 0.957956;0.957533;0.957070;0.955721;0.955891 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51384160;51380576;51380576 | 51360544;51369248;51395744;51382304;51368608 | |
203 | resnetv24_stage3_conv26_fwd | Convolution | [256,256,14,14] | 3401879.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.00 | 19365101568 | 82658784.00 | 77597429.33 | 24.70 | 120.84 | 12867.18 | false | 0.246857;0.246469;0.246639;0.247137;0.246440 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 77681984;77802144;77757056;75852896;77353248 | 84241664;90216320;81234944;80736416;82499744 | |
203 | resnetv24_stage3_conv26_fwd | Convolution | [256,256,14,14] | 3401879.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51411872.00 | 150582432.00 | 47.10 | 1.25 | 912.91 | true | 0.470511;0.471073;0.471091;0.471432;0.470625 | 251658240;251658240;251658240;251658240;251658240 | 51410400;51418848;51399968;51406368;51434464 | 150589632;150591552;150566752;150590912;150555616 | |
203 | resnetv24_stage3_conv26_fwd | Convolution | [256,256,14,14] | 3401879.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.67 | 269484032 | 151323765.33 | 54271424.00 | 48.20 | 1.31 | 1022.06 | true | 0.482514;0.481725;0.482225;0.482894;0.482548 | 269484032;269484032;269484032;269484032;269484032 | 151235040;151321216;151352064;151313568;151336512 | 54120832;54282176;54265056;54267040;54287264 | |
203 | resnetv24_stage3_conv26_fwd | Convolution | [256,256,14,14] | 3401879.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9351648.00 | 30.20 | 0.45 | 325.01 | true | 0.302920;0.300903;0.309343;0.301146;0.302135 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9345696;9367424;9341600;9341824;9387136 | |
204 | resnetv24_stage3_batchnorm26_fwd | BatchNorm | [256,256,14,14] | 6323.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 51776192.00 | 55925461.33 | 88.10 | 0.75 | 565.65 | true | 0.881045;0.880784;0.881479;0.880844;0.880784 | 81264640;81264640;81264640;81264640;81264640 | 51770848;51776256;51781472;51794464;51766112 | 55925216;55921504;55929664;55912064;55965888 | |
205 | resnetv24_stage3_activation26 | Activation | [256,256,14,14] | 2745.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51370730.67 | 95.70 | 0.25 | 188.44 | true | 0.957309;0.956843;0.956847;0.957655;0.957017 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380640;51380576;51380576 | 51372032;51372960;51370304;51369856;51361696 | |
206 | resnetv24_stage3_conv27_fwd | Convolution | [256,256,14,14] | 1415846 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2073.67 | 26409435136 | 154751861.33 | 152193184.00 | 24.90 | 86.04 | 12735.62 | false | 0.249039;0.248974;0.248891;0.248996;0.248922 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154480384;155401312;153914336;155725056;154373888 | 152475520;152568256;153065952;151535776;150389344 | |
206 | resnetv24_stage3_conv27_fwd | Convolution | [256,256,14,14] | 1415846 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2560.00 | 5.80 | 0.00 | 0.00 | true | 0.057904;0.058060;0.057910;0.058066;0.057876 | 0;0;0;0;0 | 96;6752;96;96;96 | 2432;16640;2560;2688;2432 | |
207 | resnetv24_stage3__plus8 | elemwise_add | [256,1024,14,14] | 13209.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 4288394.67 | 4891562.67 | 95.70 | 5.60 | 69.34 | true | 0.956833;0.956845;0.956995;0.956928;0.956873 | 51380224;51380224;51380224;51380224;51380224 | 5901344;1763040;4443072;5416768;3005344 | 6505184;2302656;5040736;6002496;3631456 | |
208 | resnetv24_stage3_batchnorm27_fwd | BatchNorm | [256,1024,14,14] | 25108 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 553.67 | 325058560 | 57210464.00 | 60479797.33 | 90.00 | 2.76 | 587.10 | true | 0.900512;0.900627;0.900465;0.900382;0.900492 | 325058560;325058560;325058560;325058560;325058560 | 61522496;64762336;61526336;48581088;48582560 | 62793280;69782304;62808384;55837728;52347936 | |
209 | resnetv24_stage3_activation27 | Activation | [256,1024,14,14] | 10769.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.00 | 102760448 | 64225674.67 | 64221301.33 | 98.70 | 0.80 | 189.59 | true | 0.987553;0.987088;0.987612;0.987460;0.987030 | 102760448;102760448;102760448;102760448;102760448 | 96338336;57803104;64225696;64225696;64225632 | 96334432;57803616;64220032;64221920;64221952 | |
210 | resnetv24_stage3_conv28_fwd | Convolution | [256,1024,14,14] | 1480786.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.00 | 26332364800 | 158486677.33 | 37760586.67 | 24.80 | 134.18 | 12563.15 | false | 0.247409;0.247452;0.247826;0.247657;0.247769 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 168467200;158275776;159586688;155736800;157597568 | 37764224;37717376;37774016;37827072;37743520 | |
210 | resnetv24_stage3_conv28_fwd | Convolution | [256,1024,14,14] | 1480786.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057940;0.058068;0.057881;0.058063;0.057917 | 0;0;0;0;0 | 96;96;96;4192;96 | 2432;2432;2432;10624;2432 | |
211 | resnetv24_stage3_batchnorm28_fwd | BatchNorm | [256,256,14,14] | 6278 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52049408.00 | 55752426.67 | 88.30 | 0.75 | 566.96 | true | 0.883320;0.882583;0.882840;0.882597;0.882435 | 81264640;81264640;81264640;81264640;81264640 | 52065568;52037696;52048960;52030944;52061568 | 55700128;55816768;55698976;55861216;55740384 | |
212 | resnetv24_stage3_activation28 | Activation | [256,256,14,14] | 2740 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 135.67 | 25690112 | 51380576.00 | 51368778.67 | 95.70 | 0.25 | 189.36 | true | 0.956907;0.955738;0.958573;0.956773;0.957826 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51391072;51380576;51380576 | 51366496;51367584;51372256;51389888;51362656 | |
213 | resnetv24_stage3_conv29_fwd | Convolution | [256,256,14,14] | 3413034.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.33 | 19365101568 | 85931520.00 | 76996234.67 | 24.70 | 118.86 | 12864.33 | false | 0.246753;0.246395;0.246863;0.246796;0.246898 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 83464832;85157216;85325696;87890560;87311648 | 75514688;77458976;77389376;78534464;76140352 | |
213 | resnetv24_stage3_conv29_fwd | Convolution | [256,256,14,14] | 3413034.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51388405.33 | 150632864.00 | 47.00 | 1.25 | 912.91 | true | 0.470425;0.470042;0.471035;0.470192;0.470666 | 251658240;251658240;251658240;251658240;251658240 | 51385888;51393632;51343072;51385696;51399776 | 150644416;150627264;150576832;150626912;150654816 | |
213 | resnetv24_stage3_conv29_fwd | Convolution | [256,256,14,14] | 3413034.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.00 | 269484032 | 151338005.33 | 54290912.00 | 48.20 | 1.31 | 1032.51 | true | 0.482122;0.483204;0.482440;0.482407;0.482148 | 269484032;269484032;269484032;269484032;269484032 | 151312608;151335232;151328992;151349792;151354496 | 54256416;54332928;54183008;54311360;54304960 | |
213 | resnetv24_stage3_conv29_fwd | Convolution | [256,256,14,14] | 3413034.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359893.33 | 9311338.67 | 30.70 | 0.45 | 325.01 | true | 0.300941;0.309028;0.306267;0.307337;0.305960 | 5308416;5308416;5308416;5308416;5308416 | 9267712;9346592;9358848;9311392;9276032 | 2359552;2359552;2361856;2360576;2359552 | |
214 | resnetv24_stage3_batchnorm29_fwd | BatchNorm | [256,256,14,14] | 6302.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.67 | 81264640 | 51780192.00 | 55899114.67 | 88.20 | 0.75 | 561.74 | true | 0.881808;0.881491;0.882964;0.881880;0.881920 | 81264640;81264640;81264640;81264640;81264640 | 55880608;55884128;55936096;55894432;55918784 | 51774464;51787200;51796640;51771264;51778912 | |
215 | resnetv24_stage3_activation29 | Activation | [256,256,14,14] | 2761 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51368725.33 | 95.80 | 0.25 | 188.44 | true | 0.958079;0.957426;0.958297;0.959119;0.956714 | 25690112;25690112;25690112;25690112;25690112 | 51373440;51371328;51369472;51362688;51365376 | 51380576;51380576;51380576;51380576;51380576 | |
216 | resnetv24_stage3_conv30_fwd | Convolution | [256,256,14,14] | 1428295.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2072.33 | 26409435136 | 155059274.67 | 152361824.00 | 24.90 | 85.91 | 12743.82 | false | 0.248926;0.249013;0.248946;0.248951;0.248957 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 152402816;157982464;154104800;155469440;155603584 | 149850400;154281248;151218336;153978976;151888160 | |
216 | resnetv24_stage3_conv30_fwd | Convolution | [256,256,14,14] | 1428295.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.057932;0.058089;0.057908;0.058037;0.057953 | 0;0;0;0;0 | 96;96;96;96;96 | 2560;2560;2432;2432;2432 | |
217 | resnetv24_stage3__plus9 | elemwise_add | [256,1024,14,14] | 13158.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 4632405.33 | 5229045.33 | 95.70 | 5.21 | 69.43 | true | 0.956575;0.956435;0.956570;0.956637;0.956558 | 51380224;51380224;51380224;51380224;51380224 | 3813376;4666304;1801600;5417536;5810176 | 4457600;5228512;2379776;6001024;6454720 | |
218 | resnetv24_stage3_batchnorm30_fwd | BatchNorm | [256,1024,14,14] | 24964 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 56131328.00 | 59318890.67 | 90.20 | 2.82 | 586.40 | true | 0.901950;0.902081;0.902099;0.901831;0.902285 | 325058560;325058560;325058560;325058560;325058560 | 48576320;64763776;61528640;58289024;48568768 | 48859168;69794912;62809248;62799136;52348288 | |
219 | resnetv24_stage3_activation30 | Activation | [256,1024,14,14] | 10881.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.33 | 102760448 | 69579989.33 | 70644992.00 | 98.70 | 0.73 | 190.18 | true | 0.986972;0.986999;0.987545;0.987022;0.987525 | 102760448;102760448;102760448;102760448;102760448 | 67437216;70654560;73859136;61014432;70648192 | 70647552;70647392;73870624;61011936;70640032 | |
220 | resnetv24_stage3_conv31_fwd | Convolution | [256,1024,14,14] | 1477525.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 156876789.33 | 37796330.67 | 24.80 | 135.26 | 12565.15 | false | 0.247475;0.248231;0.247717;0.248191;0.247425 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 155820032;165620512;158045824;156482016;156102528 | 37858912;37207680;37814944;37784640;37789408 | |
220 | resnetv24_stage3_conv31_fwd | Convolution | [256,1024,14,14] | 1477525.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 4736.00 | 5.80 | 0.00 | 0.00 | true | 0.057888;0.058044;0.057970;0.058094;0.057908 | 0;0;0;0;0 | 96;96;96;5216;96 | 2176;2176;9600;13184;2432 | |
221 | resnetv24_stage3_batchnorm31_fwd | BatchNorm | [256,256,14,14] | 6362 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52043904.00 | 55769984.00 | 88.30 | 0.75 | 566.96 | true | 0.882526;0.881794;0.882709;0.882996;0.882417 | 81264640;81264640;81264640;81264640;81264640 | 55777536;55821920;55772512;55759904;55727936 | 52041504;52014144;52045888;52044320;52051520 | |
222 | resnetv24_stage3_activation31 | Activation | [256,256,14,14] | 2799 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51368202.67 | 95.80 | 0.25 | 188.44 | true | 0.957888;0.957832;0.957472;0.957649;0.957190 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51370944;51369376;51367616;51367616;51366080 | |
223 | resnetv24_stage3_conv32_fwd | Convolution | [256,256,14,14] | 3404890.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.33 | 19365101568 | 83278186.67 | 77808864.00 | 24.70 | 120.22 | 12933.06 | false | 0.246755;0.246387;0.246988;0.246910;0.247130 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 87292320;82363488;89323808;78027296;80178752 | 78019776;78894272;78092256;77314560;77250880 | |
223 | resnetv24_stage3_conv32_fwd | Convolution | [256,256,14,14] | 3404890.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.33 | 251658240 | 51400458.67 | 150579797.33 | 47.10 | 1.25 | 914.01 | true | 0.470011;0.471093;0.470711;0.470226;0.470715 | 251658240;251658240;251658240;251658240;251658240 | 51402080;51414624;51401248;51367072;51398048 | 150580032;150584288;150575072;150510400;150629344 | |
223 | resnetv24_stage3_conv32_fwd | Convolution | [256,256,14,14] | 3404890.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.33 | 269484032 | 151334976.00 | 54172362.67 | 48.20 | 1.31 | 1023.36 | true | 0.482355;0.482186;0.482741;0.481360;0.481894 | 269484032;269484032;269484032;269484032;269484032 | 54251744;54143808;54077536;54321664;54121536 | 151344896;151300160;151329152;151365600;151330880 | |
223 | resnetv24_stage3_conv32_fwd | Convolution | [256,256,14,14] | 3404890.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9342272.00 | 30.30 | 0.45 | 325.01 | true | 0.310146;0.306601;0.302501;0.301224;0.300684 | 5308416;5308416;5308416;5308416;5308416 | 9321504;9332640;9372672;9392128;9289120 | 2359552;2359552;2359552;2359552;2359552 | |
224 | resnetv24_stage3_batchnorm32_fwd | BatchNorm | [256,256,14,14] | 6578 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 51773024.00 | 55921525.33 | 88.20 | 0.75 | 566.96 | true | 0.881673;0.881806;0.881785;0.881016;0.881500 | 81264640;81264640;81264640;81264640;81264640 | 51771648;51772736;51783616;51774688;51755744 | 55930080;55820000;55908704;55943744;55925792 | |
225 | resnetv24_stage3_activation32 | Activation | [256,256,14,14] | 2845 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380586.67 | 51371424.00 | 95.70 | 0.25 | 188.44 | true | 0.956858;0.957426;0.956444;0.956423;0.956961 | 25690112;25690112;25690112;25690112;25690112 | 51372512;51369344;51367648;51372416;51384544 | 51380576;51380576;51380608;51380576;51382880 | |
226 | resnetv24_stage3_conv33_fwd | Convolution | [256,256,14,14] | 1412265 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2075.67 | 26409435136 | 155261973.33 | 154232650.67 | 24.90 | 85.33 | 12723.35 | false | 0.248700;0.249025;0.248919;0.249005;0.249055 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 155766336;153172896;156671904;169399136;153347680 | 153994016;154660544;154043392;155097280;152941344 | |
226 | resnetv24_stage3_conv33_fwd | Convolution | [256,256,14,14] | 1412265 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2645.33 | 5.80 | 0.00 | 0.00 | true | 0.057952;0.058065;0.057926;0.058099;0.057885 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2688;2560;2688 | |
227 | resnetv24_stage3__plus10 | elemwise_add | [256,1024,14,14] | 13252 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 5311712.00 | 5910560.00 | 95.70 | 4.58 | 69.34 | true | 0.957031;0.956996;0.957273;0.956685;0.956956 | 51380224;51380224;51380224;51380224;51380224 | 5726880;1346912;4613536;5659648;5661952 | 6312032;1875040;5249184;6244992;6237504 | |
228 | resnetv24_stage3_batchnorm33_fwd | BatchNorm | [256,1024,14,14] | 25117.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 57211370.67 | 60477728.00 | 90.10 | 2.76 | 585.69 | true | 0.900482;0.900558;0.900413;0.900526;0.900679 | 325058560;325058560;325058560;325058560;325058560 | 62804320;55826048;55823488;62802816;62815040 | 58285664;55050848;51803808;61518080;58297600 | |
229 | resnetv24_stage3_activation33 | Activation | [256,1024,14,14] | 10787.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.33 | 102760448 | 70648106.67 | 69577984.00 | 98.70 | 0.73 | 190.18 | true | 0.987698;0.987322;0.986972;0.987503;0.987005 | 102760448;102760448;102760448;102760448;102760448 | 67436672;99549568;80281952;64225696;64225632 | 64218144;96336480;80278944;64236448;64218560 | |
230 | resnetv24_stage3_conv34_fwd | Convolution | [256,1024,14,14] | 1479397.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.67 | 26332364800 | 156772042.67 | 37806432.00 | 24.80 | 135.33 | 12559.15 | false | 0.247129;0.248146;0.247524;0.247822;0.248507 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 155993120;161158688;154965856;154288352;159357152 | 37814368;37858208;37764544;37803936;37800992 | |
230 | resnetv24_stage3_conv34_fwd | Convolution | [256,1024,14,14] | 1479397.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057935;0.058065;0.057908;0.058044;0.058594 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2176;2432;2432 | |
231 | resnetv24_stage3_batchnorm34_fwd | BatchNorm | [256,256,14,14] | 6491 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52042901.33 | 55780448.00 | 88.30 | 0.75 | 568.28 | true | 0.882778;0.882826;0.882102;0.882356;0.882659 | 81264640;81264640;81264640;81264640;81264640 | 52051008;52042240;52057184;52023392;52035456 | 55753920;55784800;55745728;55802624;55807296 | |
232 | resnetv24_stage3_activation34 | Activation | [256,256,14,14] | 2741.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51373888.00 | 95.70 | 0.25 | 188.90 | true | 0.958056;0.956443;0.955917;0.956956;0.957147 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51379008;51373440;51375616;51372608;51367008 | |
233 | resnetv24_stage3_conv35_fwd | Convolution | [256,256,14,14] | 3409335 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 87703178.67 | 77830186.67 | 24.70 | 116.99 | 12930.18 | false | 0.246525;0.246561;0.246498;0.246308;0.246678 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 90692128;79862784;89393952;88714720;85000864 | 77565632;77403072;78521856;79835200;77067488 | |
233 | resnetv24_stage3_conv35_fwd | Convolution | [256,256,14,14] | 3409335 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51397685.33 | 150596661.33 | 47.10 | 1.25 | 912.91 | true | 0.471063;0.470761;0.470945;0.470554;0.470189 | 251658240;251658240;251658240;251658240;251658240 | 51393760;51380640;51394272;51422176;51405024 | 150556128;150666080;150548160;150571232;150662624 | |
233 | resnetv24_stage3_conv35_fwd | Convolution | [256,256,14,14] | 3409335 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.00 | 269484032 | 151352725.33 | 54268405.33 | 48.20 | 1.31 | 1032.51 | true | 0.482348;0.482627;0.482251;0.482374;0.483155 | 269484032;269484032;269484032;269484032;269484032 | 54281408;54299712;54283744;54209152;54240064 | 151352992;151361760;151343424;151366048;151328096 | |
233 | resnetv24_stage3_conv35_fwd | Convolution | [256,256,14,14] | 3409335 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9323520.00 | 30.20 | 0.45 | 318.50 | true | 0.299513;0.310028;0.301862;0.301648;0.302178 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9420192;9246624;9362688;9347456;9260416 | |
234 | resnetv24_stage3_batchnorm35_fwd | BatchNorm | [256,256,14,14] | 6317 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 51782602.67 | 55893696.00 | 88.20 | 0.75 | 564.34 | true | 0.882133;0.880847;0.882093;0.881676;0.882237 | 81264640;81264640;81264640;81264640;81264640 | 55848832;55870752;55917472;55899104;55911232 | 51779296;51795616;51779456;51789056;51758912 | |
235 | resnetv24_stage3_activation35 | Activation | [256,256,14,14] | 2745 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51369120.00 | 95.80 | 0.25 | 188.44 | true | 0.958178;0.958722;0.957056;0.958407;0.957086 | 25690112;25690112;25690112;25690112;25690112 | 51368608;51370592;51364256;51370240;51368512 | 51380576;51380576;51380576;51380576;51380576 | |
236 | resnetv24_stage3_conv36_fwd | Convolution | [256,256,14,14] | 1426452.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.67 | 26409435136 | 154864469.33 | 152746880.00 | 24.90 | 85.85 | 12686.68 | false | 0.248934;0.248995;0.248723;0.249007;0.248953 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 157942496;154467488;155406880;154719040;152783840 | 155391168;151934208;156185184;150915264;150492160 | |
236 | resnetv24_stage3_conv36_fwd | Convolution | [256,256,14,14] | 1426452.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 2741.33 | 11349.33 | 5.80 | 0.00 | 0.00 | true | 0.057963;0.058097;0.057885;0.058039;0.057949 | 0;0;0;0;0 | 10848;96;6752;96;1376 | 24704;12928;16640;2432;4480 | |
237 | resnetv24_stage3__plus11 | elemwise_add | [256,1024,14,14] | 13248 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 2017770.67 | 2404480.00 | 95.60 | 11.62 | 69.37 | true | 0.956251;0.956534;0.956689;0.956331;0.956546 | 51380224;51380224;51380224;51380224;51380224 | 128;896;5011360;2743968;3308448 | 544;3040;5600768;3319424;3890976 | |
238 | resnetv24_stage3_batchnorm36_fwd | BatchNorm | [256,1024,14,14] | 24935.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 61531157.33 | 67465578.67 | 90.00 | 2.52 | 586.40 | true | 0.900296;0.900023;0.900247;0.900317;0.900442 | 325058560;325058560;325058560;325058560;325058560 | 61523008;61536992;61533472;48568608;64754400 | 69789440;69792736;62829920;55828128;69777376 | |
239 | resnetv24_stage3_activation36 | Activation | [256,1024,14,14] | 10791 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 543.00 | 102760448 | 67436917.33 | 66360906.67 | 98.70 | 0.77 | 189.25 | true | 0.987190;0.987550;0.987414;0.986923;0.987169 | 102760448;102760448;102760448;102760448;102760448 | 64221440;70639712;64220160;64221568;83492448 | 67436960;67436896;64225344;67436896;86704480 | |
240 | resnetv24_stage3_conv37_fwd | Convolution | [256,1024,14,14] | 1475739.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.33 | 26332364800 | 158647221.33 | 37739626.67 | 24.80 | 134.08 | 12561.16 | false | 0.246493;0.248031;0.247736;0.247719;0.247475 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37715744;37784448;37819520;37704768;37718688 | 157802592;157127296;161011776;155528160;161587328 | |
240 | resnetv24_stage3_conv37_fwd | Convolution | [256,1024,14,14] | 1475739.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058249;0.058067;0.057900;0.058113;0.057894 | 0;0;0;0;0 | 96;96;96;96;96 | 16000;2432;2688;2432;2688 | |
241 | resnetv24_stage3_batchnorm37_fwd | BatchNorm | [256,256,14,14] | 6294.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52054752.00 | 55722688.00 | 88.20 | 0.75 | 565.65 | true | 0.882320;0.881955;0.882495;0.882560;0.883261 | 81264640;81264640;81264640;81264640;81264640 | 52054208;52054336;52028320;52064832;52055712 | 55698656;55775040;55794528;55694368;55692192 | |
242 | resnetv24_stage3_activation37 | Activation | [256,256,14,14] | 2774.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51365205.33 | 95.70 | 0.25 | 187.98 | true | 0.958800;0.957887;0.955745;0.956748;0.955888 | 25690112;25690112;25690112;25690112;25690112 | 51370848;51355552;51361216;51363552;51372544 | 51380576;51380576;51380576;51380576;51380576 | |
243 | resnetv24_stage3_conv38_fwd | Convolution | [256,256,14,14] | 3400032.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.00 | 19365101568 | 85724586.67 | 77618144.00 | 24.70 | 118.56 | 12927.30 | false | 0.246256;0.246716;0.246933;0.246738;0.246711 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 87314592;89123840;83490432;86368736;81434016 | 77515328;76943488;77682400;77656704;78097568 | |
243 | resnetv24_stage3_conv38_fwd | Convolution | [256,256,14,14] | 3400032.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.33 | 251658240 | 51394101.33 | 150595381.33 | 47.10 | 1.25 | 914.01 | true | 0.470279;0.471168;0.470884;0.470448;0.470788 | 251658240;251658240;251658240;251658240;251658240 | 51404704;51379488;51395616;51381984;51406112 | 150556000;150586848;150571232;150638944;150628064 | |
243 | resnetv24_stage3_conv38_fwd | Convolution | [256,256,14,14] | 3400032.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 264.00 | 269484032 | 151334688.00 | 54229941.33 | 48.20 | 1.31 | 1020.77 | true | 0.481997;0.481134;0.482809;0.482398;0.482002 | 269484032;269484032;269484032;269484032;269484032 | 151330080;151342432;151340672;151333312;151311008 | 54280256;54223104;54279968;54084256;54186752 | |
243 | resnetv24_stage3_conv38_fwd | Convolution | [256,256,14,14] | 3400032.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9353536.00 | 30.10 | 0.45 | 318.50 | true | 0.299916;0.309248;0.299842;0.302527;0.300760 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9366784;9366176;9372160;9276288;9327648 | |
244 | resnetv24_stage3_batchnorm38_fwd | BatchNorm | [256,256,14,14] | 6332.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 51765141.33 | 55926336.00 | 88.10 | 0.75 | 565.65 | true | 0.881304;0.881556;0.882270;0.881558;0.881342 | 81264640;81264640;81264640;81264640;81264640 | 51766624;51766112;51774784;51760640;51762688 | 55936320;55933792;55897248;55908896;55953920 | |
245 | resnetv24_stage3_activation38 | Activation | [256,256,14,14] | 2749.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51367968.00 | 95.70 | 0.25 | 188.90 | true | 0.957330;0.956830;0.956762;0.956656;0.957300 | 25690112;25690112;25690112;25690112;25690112 | 51370720;51366240;51367200;51368416;51368288 | 51380576;51380576;51380576;51380576;51380640 | |
246 | resnetv24_stage3_conv39_fwd | Convolution | [256,256,14,14] | 1414750 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.33 | 26409435136 | 154712853.33 | 151794602.67 | 24.90 | 86.16 | 12682.62 | false | 0.248957;0.249050;0.248833;0.248992;0.248945 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 149839104;153163552;153193952;151866720;150353536 | 152615936;160146528;154130656;156494624;153513280 | |
246 | resnetv24_stage3_conv39_fwd | Convolution | [256,256,14,14] | 1414750 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057903;0.058053;0.057917;0.058099;0.057892 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;9472;2432 | |
247 | resnetv24_stage3__plus12 | elemwise_add | [256,1024,14,14] | 13236 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 2276149.33 | 2810069.33 | 95.70 | 10.10 | 69.34 | true | 0.957284;0.956810;0.956734;0.957183;0.956993 | 51380224;51380224;51380224;51380224;51380224 | 2968512;160;5332864;519456;3340480 | 3563584;1120;5945632;960352;3906272 | |
248 | resnetv24_stage3_batchnorm39_fwd | BatchNorm | [256,1024,14,14] | 25116 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 59317152.00 | 65103626.67 | 90.00 | 2.61 | 585.69 | true | 0.900750;0.900157;0.900030;0.900242;0.900513 | 325058560;325058560;325058560;325058560;325058560 | 58238240;61470208;58243008;67939456;48534400 | 62773248;69759104;62778528;76719776;52314368 | |
249 | resnetv24_stage3_activation39 | Activation | [256,1024,14,14] | 10791.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 543.33 | 102760448 | 68507264.00 | 67432469.33 | 98.70 | 0.76 | 189.13 | true | 0.987306;0.987550;0.987292;0.987199;0.986815 | 102760448;102760448;102760448;102760448;102760448 | 64220480;73856320;61011360;86695616;64220608 | 64225408;73859424;61014080;86704480;67436960 | |
250 | resnetv24_stage3_conv40_fwd | Convolution | [256,1024,14,14] | 1479881.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2093.67 | 26332364800 | 158603626.67 | 37775040.00 | 24.80 | 134.09 | 12577.15 | false | 0.248394;0.248080;0.247617;0.247598;0.247508 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 163677952;158578528;158632160;154264832;158600192 | 37798944;37801568;37764992;37761184;37709696 | |
250 | resnetv24_stage3_conv40_fwd | Convolution | [256,1024,14,14] | 1479881.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057919;0.058094;0.058231;0.058063;0.057926 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2688;2432 | |
251 | resnetv24_stage3_batchnorm40_fwd | BatchNorm | [256,256,14,14] | 6265.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52050197.33 | 55758922.67 | 88.30 | 0.75 | 566.96 | true | 0.882861;0.882021;0.882841;0.882070;0.882776 | 81264640;81264640;81264640;81264640;81264640 | 52043808;52049952;52030080;52057024;52056832 | 55755136;55748896;55826016;55772736;55690592 | |
252 | resnetv24_stage3_activation40 | Activation | [256,256,14,14] | 2765 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51369280.00 | 95.70 | 0.25 | 188.90 | true | 0.958911;0.957589;0.956684;0.956440;0.956280 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380576;51380576;51380576 | 51370432;51370144;51367264;51363456;51372288 | |
253 | resnetv24_stage3_conv41_fwd | Convolution | [256,256,14,14] | 3412094 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1511.33 | 19365101568 | 84528128.00 | 77194965.33 | 24.70 | 119.74 | 12813.26 | false | 0.246592;0.246867;0.246766;0.246712;0.246551 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 82569888;83991200;87023296;88124960;81644128 | 70584800;76583712;77688640;77784832;77312544 | |
253 | resnetv24_stage3_conv41_fwd | Convolution | [256,256,14,14] | 3412094 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.00 | 251658240 | 51394997.33 | 150569301.33 | 47.10 | 1.25 | 915.12 | true | 0.470692;0.471222;0.471091;0.470683;0.470689 | 251658240;251658240;251658240;251658240;251658240 | 51391904;51387616;51380704;51405472;51408480 | 150560224;150563040;150619872;150581568;150563296 | |
253 | resnetv24_stage3_conv41_fwd | Convolution | [256,256,14,14] | 3412094 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.00 | 269484032 | 151323061.33 | 54174538.67 | 48.30 | 1.31 | 1032.51 | true | 0.482768;0.482770;0.482294;0.482677;0.481687 | 269484032;269484032;269484032;269484032;269484032 | 151319904;151291040;151300896;151348384;151360128 | 54056736;54156992;54230560;54136064;54261632 | |
253 | resnetv24_stage3_conv41_fwd | Convolution | [256,256,14,14] | 3412094 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9360437.33 | 30.60 | 0.45 | 331.78 | true | 0.308610;0.309106;0.303014;0.307471;0.301695 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9375744;9380352;9310592;9340544;9365024 | |
254 | resnetv24_stage3_batchnorm41_fwd | BatchNorm | [256,256,14,14] | 6263 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.67 | 81264640 | 51749845.33 | 55902112.00 | 88.20 | 0.75 | 561.74 | true | 0.882263;0.882635;0.881656;0.882097;0.881834 | 81264640;81264640;81264640;81264640;81264640 | 51743648;51779392;51757856;51735456;51748032 | 55895872;55884736;55904704;55905760;55910688 | |
255 | resnetv24_stage3_activation41 | Activation | [256,256,14,14] | 2745.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 135.67 | 25690112 | 51380576.00 | 51368074.67 | 95.70 | 0.25 | 189.36 | true | 0.958022;0.958388;0.956435;0.956261;0.956903 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380576;51380576;51380576 | 51367072;51362784;51371904;51370048;51367104 | |
256 | resnetv24_stage3_conv42_fwd | Convolution | [256,256,14,14] | 1426621.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.00 | 26409435136 | 154757717.33 | 151525269.33 | 24.90 | 86.23 | 12684.65 | false | 0.248939;0.248983;0.248970;0.248952;0.248957 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 155243264;155100064;153929824;151421984;156685280 | 152492288;151979104;151280736;149988768;151315968 | |
256 | resnetv24_stage3_conv42_fwd | Convolution | [256,256,14,14] | 1426621.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057953;0.058085;0.057924;0.058546;0.057932 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
257 | resnetv24_stage3__plus13 | elemwise_add | [256,1024,14,14] | 13187.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.33 | 51380224 | 4258261.33 | 4879648.00 | 95.70 | 5.62 | 69.40 | true | 0.956592;0.956512;0.956147;0.956694;0.956418 | 51380224;51380224;51380224;51380224;51380224 | 6188480;1219456;2222304;5394688;5157792 | 6783296;1860224;2821408;5989216;5828320 | |
258 | resnetv24_stage3_batchnorm42_fwd | BatchNorm | [256,1024,14,14] | 24785.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 59371882.67 | 62816309.33 | 90.00 | 2.66 | 586.40 | true | 0.900652;0.900297;0.900364;0.900523;0.900202 | 325058560;325058560;325058560;325058560;325058560 | 61533952;64776640;51805056;87428544;48579584 | 62804576;69799712;55821024;94203968;55844640 | |
259 | resnetv24_stage3_activation42 | Activation | [256,1024,14,14] | 10858.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 539.33 | 102760448 | 65296117.33 | 64227146.67 | 98.70 | 0.79 | 190.53 | true | 0.987123;0.987158;0.987319;0.987419;0.986977 | 102760448;102760448;102760448;102760448;102760448 | 64226016;57799552;89908320;64222912;64232512 | 64225696;61014368;89915744;64225696;67436960 | |
260 | resnetv24_stage3_conv43_fwd | Convolution | [256,1024,14,14] | 1476783.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.00 | 26332364800 | 156637738.67 | 37753152.00 | 24.80 | 135.46 | 12575.15 | false | 0.247431;0.247874;0.247649;0.247753;0.247080 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 153772704;156162848;160228512;156096352;157654016 | 34773472;37855136;37792608;37717568;37749280 | |
260 | resnetv24_stage3_conv43_fwd | Convolution | [256,1024,14,14] | 1476783.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057914;0.058090;0.057900;0.058087;0.058203 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;2688;2432 | |
261 | resnetv24_stage3_batchnorm43_fwd | BatchNorm | [256,256,14,14] | 6237.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52048448.00 | 55755146.67 | 88.20 | 0.75 | 568.28 | true | 0.882351;0.882540;0.883268;0.882031;0.881594 | 81264640;81264640;81264640;81264640;81264640 | 52056544;52044512;52040800;52072096;52044288 | 55732544;55776160;55756736;55679680;55778080 | |
262 | resnetv24_stage3_activation43 | Activation | [256,256,14,14] | 2746.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51375008.00 | 95.60 | 0.25 | 188.44 | true | 0.956102;0.956865;0.956884;0.956451;0.955093 | 25690112;25690112;25690112;25690112;25690112 | 51371840;51370368;51375968;51377216;51377696 | 51380608;51380576;51380576;51380576;51380576 | |
263 | resnetv24_stage3_conv44_fwd | Convolution | [256,256,14,14] | 3400760 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.00 | 19365101568 | 84940629.33 | 77659328.00 | 24.70 | 119.10 | 12867.18 | false | 0.246905;0.246945;0.246786;0.246868;0.246908 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 78248928;89268384;79480864;86072640;89458368 | 77802240;77532864;76414208;77732992;77712128 | |
263 | resnetv24_stage3_conv44_fwd | Convolution | [256,256,14,14] | 3400760 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51406090.67 | 150584693.33 | 47.10 | 1.25 | 911.81 | true | 0.470135;0.470498;0.471833;0.471158;0.470485 | 251658240;251658240;251658240;251658240;251658240 | 51399328;51415712;51345568;51425824;51403232 | 150619072;150571968;150598464;150583648;150562880 | |
263 | resnetv24_stage3_conv44_fwd | Convolution | [256,256,14,14] | 3400760 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.33 | 269484032 | 151337674.67 | 54220810.67 | 48.30 | 1.31 | 1023.36 | true | 0.482411;0.482089;0.482816;0.482737;0.482814 | 269484032;269484032;269484032;269484032;269484032 | 151358848;151343712;151344128;151227104;151325184 | 54179392;54244224;54247840;54238816;53922688 | |
263 | resnetv24_stage3_conv44_fwd | Convolution | [256,256,14,14] | 3400760 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9331082.67 | 30.40 | 0.45 | 318.50 | true | 0.304047;0.301516;0.303525;0.308664;0.303234 | 5308416;5308416;5308416;5308416;5308416 | 9322528;9331744;9327232;9334272;9349152 | 2359552;2359552;2359552;2359552;2359552 | |
264 | resnetv24_stage3_batchnorm44_fwd | BatchNorm | [256,256,14,14] | 6331.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51767232.00 | 55927093.33 | 88.20 | 0.75 | 568.28 | true | 0.882131;0.881217;0.881836;0.881747;0.881361 | 81264640;81264640;81264640;81264640;81264640 | 51767424;51780192;51767296;51766976;51749152 | 55855680;55915040;55908032;55995584;55958208 | |
265 | resnetv24_stage3_activation44 | Activation | [256,256,14,14] | 2745 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 135.67 | 25690112 | 51380576.00 | 51369408.00 | 95.80 | 0.25 | 189.36 | true | 0.958186;0.956062;0.955683;0.958428;0.958557 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51370752;51366496;51371552;51362368;51370976 | |
266 | resnetv24_stage3_conv45_fwd | Convolution | [256,256,14,14] | 1414239.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.33 | 26409435136 | 157281077.33 | 153129610.67 | 24.90 | 85.08 | 12688.71 | false | 0.248991;0.248960;0.248872;0.249025;0.249076 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153035680;152722880;153630272;154280064;152018208 | 156625696;157964960;157926976;153810624;157290560 | |
266 | resnetv24_stage3_conv45_fwd | Convolution | [256,256,14,14] | 1414239.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.057956;0.058084;0.057923;0.058097;0.057930 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2688;2432;2688 | |
267 | resnetv24_stage3__plus14 | elemwise_add | [256,1024,14,14] | 13264.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 3347680.00 | 3960352.00 | 95.70 | 7.03 | 69.34 | true | 0.956812;0.957189;0.957127;0.956691;0.956729 | 51380224;51380224;51380224;51380224;51380224 | 544;4628448;5435264;5600288;1817344 | 128;3992064;4812576;4996480;1238400 | |
268 | resnetv24_stage3_batchnorm45_fwd | BatchNorm | [256,1024,14,14] | 25072 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 56083797.33 | 60458026.67 | 90.00 | 2.79 | 585.69 | true | 0.900320;0.900441;0.900332;0.900450;0.900369 | 325058560;325058560;325058560;325058560;325058560 | 48532896;61481472;51772608;58242496;58236288 | 52327936;69770528;55806656;62786112;62781312 | |
269 | resnetv24_stage3_activation45 | Activation | [256,1024,14,14] | 10845.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.00 | 102760448 | 68507285.33 | 67431413.33 | 98.70 | 0.76 | 189.59 | true | 0.987265;0.987543;0.987705;0.986932;0.987240 | 102760448;102760448;102760448;102760448;102760448 | 70647936;67436960;67436896;73859424;67436960 | 70642496;64222016;67428416;77067712;64223328 | |
270 | resnetv24_stage3_conv46_fwd | Convolution | [256,1024,14,14] | 1480547.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.33 | 26332364800 | 158148608.00 | 37772469.33 | 24.70 | 134.40 | 12573.15 | false | 0.247401;0.247387;0.247673;0.247460;0.247219 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 158164064;161272384;154880832;156505504;159776256 | 37751264;37731616;37750656;37819712;37815488 | |
270 | resnetv24_stage3_conv46_fwd | Convolution | [256,1024,14,14] | 1480547.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057903;0.058070;0.058092;0.058039;0.057924 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
271 | resnetv24_stage3_batchnorm46_fwd | BatchNorm | [256,256,14,14] | 6298.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52042346.67 | 55776682.67 | 88.30 | 0.75 | 568.28 | true | 0.882467;0.883178;0.883317;0.882828;0.882692 | 81264640;81264640;81264640;81264640;81264640 | 52065920;52045504;52038304;52025952;52043232 | 55672000;55768160;55849152;55827616;55734272 | |
272 | resnetv24_stage3_activation46 | Activation | [256,256,14,14] | 2758 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380586.67 | 51372192.00 | 95.70 | 0.25 | 188.90 | true | 0.956636;0.956298;0.957036;0.957265;0.956001 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380608;51380576;51380576 | 51360352;51370336;51382720;51374880;51371360 | |
273 | resnetv24_stage3_conv47_fwd | Convolution | [256,256,14,14] | 3411460.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.33 | 19365101568 | 84447925.33 | 77478592.00 | 24.70 | 119.59 | 12864.33 | false | 0.246917;0.246798;0.246947;0.246749;0.246641 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 77410720;75201312;77474592;77877888;77550464 | 84361088;82882432;86084032;82898656;89448384 | |
273 | resnetv24_stage3_conv47_fwd | Convolution | [256,256,14,14] | 3411460.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.00 | 251658240 | 51386314.67 | 150605013.33 | 47.10 | 1.25 | 915.12 | true | 0.470729;0.470809;0.470988;0.471173;0.470128 | 251658240;251658240;251658240;251658240;251658240 | 150593472;150605664;150642624;150569280;150615904 | 51369376;51366176;51395616;51393952;51421920 | |
273 | resnetv24_stage3_conv47_fwd | Convolution | [256,256,14,14] | 3411460.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.67 | 269484032 | 151333098.67 | 54240597.33 | 48.20 | 1.31 | 1029.87 | true | 0.482186;0.482213;0.482506;0.482368;0.482167 | 269484032;269484032;269484032;269484032;269484032 | 151289920;151316544;151344064;151350592;151338688 | 54295776;54292480;54232512;54177376;54196800 | |
273 | resnetv24_stage3_conv47_fwd | Convolution | [256,256,14,14] | 3411460.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9322421.33 | 30.60 | 0.45 | 318.50 | true | 0.299521;0.307232;0.310424;0.307187;0.304292 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9336960;9336736;9287808;9373440;9293568 | |
274 | resnetv24_stage3_batchnorm47_fwd | BatchNorm | [256,256,14,14] | 6291 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 51776352.00 | 55905002.67 | 88.20 | 0.75 | 564.34 | true | 0.882185;0.882368;0.881691;0.881749;0.882095 | 81264640;81264640;81264640;81264640;81264640 | 51787296;51774304;51787520;51766848;51767456 | 55880448;55939936;55879488;55920096;55914464 | |
275 | resnetv24_stage3_activation47 | Activation | [256,256,14,14] | 2718.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 135.67 | 25690112 | 51380576.00 | 51367360.00 | 95.70 | 0.25 | 189.36 | true | 0.957411;0.956905;0.957072;0.957178;0.956299 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380832;51380576;51380576;51380576 | 51364864;51364768;51370784;51381600;51366432 | |
276 | resnetv24_stage3_conv48_fwd | Convolution | [256,256,14,14] | 1425952.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.33 | 26409435136 | 155805717.33 | 151859125.33 | 24.90 | 85.84 | 12688.71 | false | 0.248683;0.248906;0.248934;0.248573;0.248996 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 155393024;153456768;159097376;157142880;154881248 | 150620672;151183264;153738400;153265088;151129024 | |
276 | resnetv24_stage3_conv48_fwd | Convolution | [256,256,14,14] | 1425952.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2773.33 | 5.80 | 0.00 | 0.00 | true | 0.057912;0.058256;0.057897;0.058058;0.057916 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;4224;3456;2432 | |
277 | resnetv24_stage3__plus15 | elemwise_add | [256,1024,14,14] | 13152.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 1927733.33 | 2499253.33 | 95.60 | 11.61 | 69.43 | true | 0.956276;0.956319;0.956226;0.956446;0.956821 | 51380224;51380224;51380224;51380224;51380224 | 2719520;2105152;128;958528;2846624 | 3278464;2683424;1024;1535872;3421568 | |
278 | resnetv24_stage3_batchnorm48_fwd | BatchNorm | [256,1024,14,14] | 24950 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 59316170.67 | 62789322.67 | 90.00 | 2.66 | 585.69 | true | 0.900327;0.900790;0.900386;0.900338;0.900387 | 325058560;325058560;325058560;325058560;325058560 | 61474560;54999200;61474752;74415072;51772896 | 62790688;55817120;69760160;83712992;55800224 | |
279 | resnetv24_stage3_activation48 | Activation | [256,1024,14,14] | 10784.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.67 | 102760448 | 68507146.67 | 70646229.33 | 98.70 | 0.74 | 189.71 | true | 0.986957;0.987690;0.987374;0.987430;0.986883 | 102760448;102760448;102760448;102760448;102760448 | 61014144;96338336;64225344;80281952;54591904 | 64224096;96335200;64224832;83489760;57801504 | |
280 | resnetv24_stage3_conv49_fwd | Convolution | [256,1024,14,14] | 1476569.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.00 | 26332364800 | 157954794.67 | 37718485.33 | 24.80 | 134.57 | 12575.15 | false | 0.247605;0.247495;0.247454;0.247893;0.247495 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 170220768;157152096;159105024;156453120;157607264 | 37561504;37704480;37763744;37694720;37756256 | |
280 | resnetv24_stage3_conv49_fwd | Convolution | [256,1024,14,14] | 1476569.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.057910;0.058333;0.057942;0.058078;0.057916 | 0;0;0;0;0 | 96;96;96;96;96 | 2816;2432;2688;2432;2688 | |
281 | resnetv24_stage3_batchnorm49_fwd | BatchNorm | [256,256,14,14] | 6323.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52047701.33 | 55767370.67 | 88.30 | 0.75 | 566.96 | true | 0.882797;0.882082;0.883498;0.882944;0.883565 | 81264640;81264640;81264640;81264640;81264640 | 52046816;52046016;52050272;52065792;52040768 | 55797088;55758208;55798048;55670592;55746816 | |
282 | resnetv24_stage3_activation49 | Activation | [256,256,14,14] | 2740.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 135.67 | 25690112 | 51380586.67 | 51366474.67 | 95.70 | 0.25 | 189.36 | true | 0.957264;0.958653;0.956462;0.957695;0.957042 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380608;51380608;51380576 | 51367040;51376768;51361600;51356352;51370784 | |
283 | resnetv24_stage3_conv50_fwd | Convolution | [256,256,14,14] | 3403486.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.00 | 19365101568 | 84802688.00 | 77393898.67 | 24.70 | 119.39 | 12867.18 | false | 0.246829;0.247085;0.246874;0.246881;0.246665 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 83011744;89349024;83310944;76893440;88085376 | 75302464;77115552;77389120;77833632;77677024 | |
283 | resnetv24_stage3_conv50_fwd | Convolution | [256,256,14,14] | 3403486.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51400373.33 | 150587797.33 | 47.10 | 1.25 | 912.91 | true | 0.471153;0.469988;0.471293;0.471233;0.471622 | 251658240;251658240;251658240;251658240;251658240 | 51389024;51405920;51423840;51402464;51392736 | 150607168;150521664;150596032;150615232;150560192 | |
283 | resnetv24_stage3_conv50_fwd | Convolution | [256,256,14,14] | 3403486.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.67 | 269484032 | 151337930.67 | 54215658.67 | 48.20 | 1.31 | 1022.06 | true | 0.482824;0.482524;0.481976;0.482314;0.481802 | 269484032;269484032;269484032;269484032;269484032 | 151345312;151321216;151345248;151330144;151338400 | 54300224;54136064;54119008;54229760;54281152 | |
283 | resnetv24_stage3_conv50_fwd | Convolution | [256,256,14,14] | 3403486.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9326005.33 | 30.30 | 0.45 | 318.50 | true | 0.299981;0.301190;0.307676;0.307172;0.298057 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359808;2359552 | 9324544;9416096;9308416;9306368;9345056 | |
284 | resnetv24_stage3_batchnorm50_fwd | BatchNorm | [256,256,14,14] | 6294.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51772405.33 | 55939584.00 | 88.20 | 0.75 | 568.28 | true | 0.881605;0.882440;0.881998;0.882008;0.882425 | 81264640;81264640;81264640;81264640;81264640 | 51770496;51776448;51748416;51770272;51788128 | 55949856;55907488;55959136;55912352;55956544 | |
285 | resnetv24_stage3_activation50 | Activation | [256,256,14,14] | 2765.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380597.33 | 51366848.00 | 95.70 | 0.25 | 187.98 | true | 0.957009;0.957624;0.956872;0.957382;0.956954 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380608;51380608;51380576 | 51367168;51379360;51364480;51368896;51358688 | |
286 | resnetv24_stage3_conv51_fwd | Convolution | [256,256,14,14] | 1417659.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.67 | 26409435136 | 156286048.00 | 152200138.67 | 24.90 | 85.61 | 12686.68 | false | 0.248948;0.248982;0.248616;0.248934;0.249009 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 155838912;157732416;155286816;153516576;160627936 | 151369088;153041344;152189984;150355584;154196736 | |
286 | resnetv24_stage3_conv51_fwd | Convolution | [256,256,14,14] | 1417659.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058310;0.058051;0.057962;0.058071;0.057942 | 0;0;0;0;0 | 2432;2688;2432;2688;2432 | 96;96;96;96;96 | |
287 | resnetv24_stage3__plus16 | elemwise_add | [256,1024,14,14] | 13366 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 4082912.00 | 4667029.33 | 95.70 | 5.87 | 69.34 | true | 0.956953;0.957069;0.957048;0.957146;0.956905 | 51380224;51380224;51380224;51380224;51380224 | 5977344;128;1256672;6402048;5014720 | 6540352;800;1826944;6927744;5633792 | |
288 | resnetv24_stage3_batchnorm51_fwd | BatchNorm | [256,1024,14,14] | 25105.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 57206378.67 | 60475509.33 | 90.00 | 2.76 | 585.69 | true | 0.900356;0.900286;0.900491;0.900548;0.900507 | 325058560;325058560;325058560;325058560;325058560 | 62780544;83749568;55837792;62808192;55832608 | 58287648;77723872;55051808;58279680;48569280 | |
289 | resnetv24_stage3_activation51 | Activation | [256,1024,14,14] | 10783.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.00 | 102760448 | 67440085.33 | 68505674.67 | 98.70 | 0.76 | 189.95 | true | 0.986821;0.986871;0.987340;0.987091;0.987097 | 102760448;102760448;102760448;102760448;102760448 | 67446400;73859488;67436896;67436896;67436960 | 64244640;73857984;70651232;67434784;67431008 | |
290 | resnetv24_stage3_conv52_fwd | Convolution | [256,1024,14,14] | 1480979.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.33 | 26332364800 | 156256320.00 | 37772714.67 | 24.80 | 135.71 | 12561.16 | false | 0.247838;0.247499;0.247357;0.248500;0.247368 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 157780288;154587072;157401248;155365152;156002560 | 37848160;37859776;37733632;37730624;37736352 | |
290 | resnetv24_stage3_conv52_fwd | Convolution | [256,1024,14,14] | 1480979.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057914;0.058092;0.057879;0.058056;0.057920 | 0;0;0;0;0 | 7552;2176;2432;2432;2432 | 96;96;96;96;96 | |
291 | resnetv24_stage3_batchnorm52_fwd | BatchNorm | [256,256,14,14] | 6342.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52042954.67 | 55782688.00 | 88.30 | 0.75 | 568.28 | true | 0.882332;0.882902;0.883491;0.882292;0.882559 | 81264640;81264640;81264640;81264640;81264640 | 52030816;52031648;52068896;52059072;52038144 | 55823968;55807296;55682272;55794176;55746592 | |
292 | resnetv24_stage3_activation52 | Activation | [256,256,14,14] | 2771 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51369952.00 | 95.70 | 0.25 | 187.98 | true | 0.956379;0.957083;0.956997;0.958474;0.957738 | 25690112;25690112;25690112;25690112;25690112 | 51370400;51371424;51372000;51364832;51368032 | 51380576;51380576;51380576;51380576;51380608 | |
293 | resnetv24_stage3_conv53_fwd | Convolution | [256,256,14,14] | 3414196 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 87053194.67 | 77218368.00 | 24.70 | 117.88 | 12930.18 | false | 0.246956;0.246880;0.246617;0.246904;0.246852 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 84947808;80007840;89314560;87301536;88910240 | 77308224;77723264;77158048;76122944;77188832 | |
293 | resnetv24_stage3_conv53_fwd | Convolution | [256,256,14,14] | 3414196 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51401333.33 | 150604074.67 | 47.00 | 1.25 | 912.91 | true | 0.469979;0.470181;0.470721;0.471495;0.470350 | 251658240;251658240;251658240;251658240;251658240 | 51388512;51433824;51390432;51398688;51414880 | 150658272;150595520;150531392;150601312;150615392 | |
293 | resnetv24_stage3_conv53_fwd | Convolution | [256,256,14,14] | 3414196 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.33 | 269484032 | 151331370.67 | 54294378.67 | 48.20 | 1.31 | 1031.19 | true | 0.482777;0.482648;0.481904;0.482065;0.482278 | 269484032;269484032;269484032;269484032;269484032 | 151322848;151345664;151318208;151325600;151355168 | 54013152;54291936;54278464;54312736;54316160 | |
293 | resnetv24_stage3_conv53_fwd | Convolution | [256,256,14,14] | 3414196 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9331434.67 | 30.20 | 0.45 | 318.50 | true | 0.299317;0.300301;0.300817;0.306697;0.306131 | 5308416;5308416;5308416;5308416;5308416 | 9254272;9348000;9433472;9323008;9323296 | 2359552;2360576;2359552;2359552;2359552 | |
294 | resnetv24_stage3_batchnorm53_fwd | BatchNorm | [256,256,14,14] | 6252 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.67 | 81264640 | 51773813.33 | 55897024.00 | 88.20 | 0.75 | 561.74 | true | 0.881486;0.881872;0.882151;0.882555;0.882484 | 81264640;81264640;81264640;81264640;81264640 | 51779040;51769568;51779360;51771008;51771392 | 55896544;55874432;55922400;55895232;55899296 | |
295 | resnetv24_stage3_activation53 | Activation | [256,256,14,14] | 2751 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51367349.33 | 95.80 | 0.25 | 188.90 | true | 0.958912;0.956488;0.958378;0.957734;0.955754 | 25690112;25690112;25690112;25690112;25690112 | 51367584;51367808;51366656;51366208;51371168 | 51380576;51380576;51380576;51380576;51382880 | |
296 | resnetv24_stage3_conv54_fwd | Convolution | [256,256,14,14] | 1427244 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2080.67 | 26409435136 | 156180949.33 | 152198880.00 | 24.90 | 85.64 | 12692.77 | false | 0.248999;0.249030;0.249048;0.249042;0.249020 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 156386432;154621216;154885856;163568224;157270560 | 153771232;150925920;150976384;154578944;151849024 | |
296 | resnetv24_stage3_conv54_fwd | Convolution | [256,256,14,14] | 1427244 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057928;0.058087;0.057906;0.058082;0.057942 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
297 | resnetv24_stage3__plus17 | elemwise_add | [256,1024,14,14] | 13167.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.33 | 51380224 | 3163146.67 | 3774005.33 | 95.60 | 7.41 | 69.40 | true | 0.956218;0.956544;0.956305;0.956656;0.956641 | 51380224;51380224;51380224;51380224;51380224 | 3918688;4375680;1195072;5184640;1009728 | 4542432;4993632;1785952;5797760;1559296 | |
298 | resnetv24_stage3_batchnorm54_fwd | BatchNorm | [256,1024,14,14] | 24771 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 57208042.67 | 62805088.00 | 90.00 | 2.71 | 586.40 | true | 0.900712;0.900312;0.900491;0.900424;0.900555 | 325058560;325058560;325058560;325058560;325058560 | 48569088;61524160;55044448;58289600;58290080 | 52333056;66296384;62796256;62815360;62803648 | |
299 | resnetv24_stage3_activation54 | Activation | [256,1024,14,14] | 11001.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 539.67 | 102760448 | 68509632.00 | 68507573.33 | 98.70 | 0.75 | 190.41 | true | 0.987183;0.987581;0.987410;0.987019;0.987268 | 102760448;102760448;102760448;102760448;102760448 | 73859488;80282016;67437120;64232288;61014432 | 70642336;77068352;70643680;64236704;64223040 | |
300 | resnetv24_stage3_conv55_fwd | Convolution | [256,1024,14,14] | 1478875.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.00 | 26332364800 | 158853578.67 | 37762517.33 | 24.80 | 133.93 | 12563.15 | false | 0.247732;0.248115;0.247317;0.247399;0.247901 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 159183712;157016352;159200768;165595296;158176256 | 37052512;37769888;37799328;37763392;37754272 | |
300 | resnetv24_stage3_conv55_fwd | Convolution | [256,1024,14,14] | 1478875.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057890;0.058312;0.057976;0.058085;0.057867 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;2688;2432 | |
301 | resnetv24_stage3_batchnorm55_fwd | BatchNorm | [256,256,14,14] | 6550 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52046154.67 | 55744373.33 | 88.20 | 0.75 | 568.28 | true | 0.882178;0.883127;0.882015;0.881762;0.882729 | 81264640;81264640;81264640;81264640;81264640 | 52060096;52037152;52047008;52047456;52044000 | 55687936;55827392;55724480;55781664;55726976 | |
302 | resnetv24_stage3_activation55 | Activation | [256,256,14,14] | 2765 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51368192.00 | 95.70 | 0.25 | 188.90 | true | 0.955979;0.957769;0.957347;0.957045;0.956629 | 25690112;25690112;25690112;25690112;25690112 | 51370400;51361472;51374432;51365088;51369088 | 51380576;51380576;51380576;51380576;51380576 | |
303 | resnetv24_stage3_conv56_fwd | Convolution | [256,256,14,14] | 3402556 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 85203029.33 | 76870933.33 | 24.70 | 119.48 | 12930.18 | false | 0.246926;0.246947;0.247046;0.246545;0.246716 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 77533760;72277888;77560768;75667104;77411936 | 82414720;85001760;89224864;88192608;81389856 | |
303 | resnetv24_stage3_conv56_fwd | Convolution | [256,256,14,14] | 3402556 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.33 | 251658240 | 51387338.67 | 150601205.33 | 47.10 | 1.25 | 910.71 | true | 0.471028;0.471218;0.471000;0.471226;0.471655 | 251658240;251658240;251658240;251658240;251658240 | 51386784;51385376;51388064;51387168;51407328 | 150606016;150595264;150558432;150638944;150602336 | |
303 | resnetv24_stage3_conv56_fwd | Convolution | [256,256,14,14] | 3402556 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.33 | 269484032 | 151323221.33 | 54199381.33 | 48.20 | 1.31 | 1023.36 | true | 0.482222;0.482077;0.481917;0.482390;0.481992 | 269484032;269484032;269484032;269484032;269484032 | 151329120;151301600;151338944;151296032;151343392 | 54194176;54121568;54262944;54141024;54294112 | |
303 | resnetv24_stage3_conv56_fwd | Convolution | [256,256,14,14] | 3402556 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9313514.67 | 30.60 | 0.45 | 325.01 | true | 0.307598;0.306047;0.300350;0.314132;0.303773 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9286528;9308576;9391264;9295264;9336704 | |
304 | resnetv24_stage3_batchnorm56_fwd | BatchNorm | [256,256,14,14] | 6328.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51772160.00 | 55929632.00 | 88.20 | 0.75 | 568.28 | true | 0.881782;0.882277;0.882502;0.881429;0.881815 | 81264640;81264640;81264640;81264640;81264640 | 55930272;55908864;55941856;55949280;55916768 | 51764384;51771136;51776320;51776096;51769248 | |
305 | resnetv24_stage3_activation56 | Activation | [256,256,14,14] | 2769.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51373173.33 | 95.70 | 0.25 | 188.44 | true | 0.956754;0.958044;0.956463;0.956802;0.958021 | 25690112;25690112;25690112;25690112;25690112 | 51380640;51380576;51380576;51380576;51380576 | 51376416;51372256;51381888;51370848;51365216 | |
306 | resnetv24_stage3_conv57_fwd | Convolution | [256,256,14,14] | 1416511.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2080.67 | 26409435136 | 157554528.00 | 152749034.67 | 24.90 | 85.11 | 12692.77 | false | 0.248956;0.248976;0.249010;0.249010;0.248950 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 151430464;153896384;153049728;153766912;150738176 | 156849120;159366208;158842720;156971744;154299968 | |
306 | resnetv24_stage3_conv57_fwd | Convolution | [256,256,14,14] | 1416511.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 3328.00 | 5.80 | 0.00 | 0.00 | true | 0.057883;0.058044;0.057908;0.058083;0.057881 | 0;0;0;0;0 | 15616;2560;2688;2432;4736 | 96;96;96;96;1120 | |
307 | resnetv24_stage3__plus18 | elemwise_add | [256,1024,14,14] | 13396.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 2172501.33 | 2585312.00 | 95.70 | 10.80 | 69.37 | true | 0.957362;0.957287;0.957123;0.956932;0.957068 | 51380224;51380224;51380224;51380224;51380224 | 836352;128;62112;5827520;5619040 | 1414304;544;179680;6381344;6161952 | |
308 | resnetv24_stage3_batchnorm57_fwd | BatchNorm | [256,1024,14,14] | 25368.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 69082250.67 | 73266112.00 | 90.20 | 2.28 | 586.40 | true | 0.902152;0.901965;0.901770;0.901983;0.901963 | 325058560;325058560;325058560;325058560;325058560 | 74482272;71238016;84202208;58286272;61526464 | 80233184;76757024;90736256;62808128;62797344 | |
309 | resnetv24_stage3_activation57 | Activation | [256,1024,14,14] | 10802.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.33 | 102760448 | 111324213.33 | 111317749.33 | 98.70 | 0.46 | 189.48 | true | 0.987362;0.987228;0.987499;0.987324;0.987429 | 102760448;102760448;102760448;102760448;102760448 | 109179104;122023808;122019680;96332704;102754464 | 109183328;122028448;125239712;96343904;102760864 | |
310 | resnetv24_stage3_conv58_fwd | Convolution | [256,1024,14,14] | 1483515 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.00 | 26332364800 | 161258922.67 | 37807146.67 | 24.80 | 132.28 | 12563.15 | false | 0.248129;0.247412;0.247847;0.247597;0.247713 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 158381280;161879200;163387648;160968288;160929280 | 37754336;37807808;37862496;37824416;37789216 | |
310 | resnetv24_stage3_conv58_fwd | Convolution | [256,1024,14,14] | 1483515 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057919;0.058082;0.057915;0.058053;0.057917 | 0;0;0;0;0 | 96;7264;96;96;96 | 2432;17152;2432;2432;2432 | |
311 | resnetv24_stage3_batchnorm58_fwd | BatchNorm | [256,256,14,14] | 6654 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52057344.00 | 55775285.33 | 88.30 | 0.75 | 568.28 | true | 0.883635;0.883757;0.882893;0.882301;0.882536 | 81264640;81264640;81264640;81264640;81264640 | 55701472;55772000;55788000;55765856;55792224 | 52056704;52061248;52039808;52059712;52055616 | |
312 | resnetv24_stage3_activation58 | Activation | [256,256,14,14] | 2730 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51369824.00 | 95.60 | 0.25 | 188.44 | true | 0.957826;0.955828;0.955871;0.956473;0.956444 | 25690112;25690112;25690112;25690112;25690112 | 51370240;51368416;51371296;51360960;51370816 | 51380608;51380576;51380576;51380576;51380576 | |
313 | resnetv24_stage3_conv59_fwd | Convolution | [256,256,14,14] | 3409032.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.00 | 19365101568 | 88676842.67 | 78153077.33 | 24.70 | 116.08 | 12927.30 | false | 0.246921;0.246972;0.246652;0.246797;0.246821 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 89912608;91443840;87478880;84371392;88639040 | 79408832;78376416;78154624;77928192;76224096 | |
313 | resnetv24_stage3_conv59_fwd | Convolution | [256,256,14,14] | 3409032.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.33 | 251658240 | 51396533.33 | 150577568.00 | 47.10 | 1.25 | 914.01 | true | 0.471162;0.471111;0.470775;0.470669;0.470106 | 251658240;251658240;251658240;251658240;251658240 | 51405664;51406624;51391136;51380576;51392800 | 150627904;150566464;150543584;150597856;150568384 | |
313 | resnetv24_stage3_conv59_fwd | Convolution | [256,256,14,14] | 3409032.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.00 | 269484032 | 151350848.00 | 54266869.33 | 48.20 | 1.31 | 1032.51 | true | 0.482251;0.482945;0.482379;0.482090;0.482127 | 269484032;269484032;269484032;269484032;269484032 | 151352032;151364288;151331392;151347616;151352896 | 54300992;54305568;54203808;54242464;54257152 | |
313 | resnetv24_stage3_conv59_fwd | Convolution | [256,256,14,14] | 3409032.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9360949.33 | 30.40 | 0.45 | 318.50 | true | 0.302819;0.299431;0.308219;0.305052;0.304920 | 5308416;5308416;5308416;5308416;5308416 | 9316256;9366784;9370144;9345920;9377568 | 2359552;2361344;2359552;2359552;2359552 | |
314 | resnetv24_stage3_batchnorm59_fwd | BatchNorm | [256,256,14,14] | 6316.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 145.00 | 81264640 | 51774272.00 | 55898165.33 | 88.20 | 0.75 | 560.45 | true | 0.882838;0.881560;0.882644;0.881116;0.882293 | 81264640;81264640;81264640;81264640;81264640 | 51780032;51774240;51768320;51776736;51771840 | 55894496;55936960;55878432;55890784;55909216 | |
315 | resnetv24_stage3_activation59 | Activation | [256,256,14,14] | 2746.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51368202.67 | 95.80 | 0.25 | 187.98 | true | 0.957806;0.957435;0.958102;0.954886;0.958600 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380576;51380576;51380576 | 51366624;51367424;51370720;51368096;51369088 | |
316 | resnetv24_stage3_conv60_fwd | Convolution | [256,256,14,14] | 1425710.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2078.67 | 26409435136 | 154621248.00 | 153080618.67 | 24.90 | 85.83 | 12704.99 | false | 0.249024;0.248880;0.248953;0.248619;0.248949 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 151614400;157718944;153526848;156449792;153887104 | 153858016;153525856;154116992;151857984;151017280 | |
316 | resnetv24_stage3_conv60_fwd | Convolution | [256,256,14,14] | 1425710.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057921;0.058078;0.057897;0.058084;0.057924 | 0;0;0;0;0 | 2432;2432;2432;9088;2432 | 96;96;96;96;96 | |
317 | resnetv24_stage3__plus19 | elemwise_add | [256,1024,14,14] | 13165.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 4379936.00 | 4986453.33 | 95.70 | 5.49 | 69.43 | true | 0.956561;0.956760;0.956557;0.956368;0.956472 | 51380224;51380224;51380224;51380224;51380224 | 3756512;4357568;5461888;5025728;2990784 | 4400064;4954240;6070016;5605056;3556160 | |
318 | resnetv24_stage3_batchnorm60_fwd | BatchNorm | [256,1024,14,14] | 24754.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 59312106.67 | 62785749.33 | 90.00 | 2.66 | 585.69 | true | 0.900397;0.900253;0.900672;0.900600;0.900309 | 325058560;325058560;325058560;325058560;325058560 | 54997088;61462560;61476672;77648416;51767488 | 55809376;69755808;62790048;83704096;55811392 | |
319 | resnetv24_stage3_activation60 | Activation | [256,1024,14,14] | 10832.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.33 | 102760448 | 68507285.33 | 66361792.00 | 98.70 | 0.76 | 190.18 | true | 0.987011;0.987146;0.987557;0.987626;0.987381 | 102760448;102760448;102760448;102760448;102760448 | 67436960;67436960;70647936;86704480;67436960 | 64218848;64222272;70641824;86696192;64221280 | |
320 | resnetv24_stage3_conv61_fwd | Convolution | [256,1024,14,14] | 1479174.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.33 | 26332364800 | 159348629.33 | 37759904.00 | 24.80 | 133.59 | 12573.15 | false | 0.247651;0.247559;0.247993;0.248228;0.246942 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37779040;37782944;37662464;37798880;37717728 | 160486752;157380192;160178944;164117184;155061216 | |
320 | resnetv24_stage3_conv61_fwd | Convolution | [256,1024,14,14] | 1479174.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058036;0.058037;0.057926;0.058192;0.057888 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2688;2432;2688 | |
321 | resnetv24_stage3_batchnorm61_fwd | BatchNorm | [256,256,14,14] | 6238 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52044352.00 | 55748181.33 | 88.30 | 0.75 | 568.28 | true | 0.883643;0.883480;0.881215;0.881616;0.882550 | 81264640;81264640;81264640;81264640;81264640 | 55731712;55741088;55771264;55732192;55786880 | 52053536;52039456;52040064;52056128;52025088 | |
322 | resnetv24_stage3_activation61 | Activation | [256,256,14,14] | 2766.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51373813.33 | 95.70 | 0.25 | 188.90 | true | 0.957349;0.956922;0.955927;0.956965;0.958670 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51373408;51374816;51373216;51377472;51371296 | |
323 | resnetv24_stage3_conv62_fwd | Convolution | [256,256,14,14] | 3404338.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 86153376.00 | 77776170.67 | 24.70 | 118.13 | 12930.18 | false | 0.246470;0.246915;0.246657;0.246803;0.246693 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 81861184;87047808;80505888;89551136;89922784 | 78054400;78290528;77250624;78023488;74090080 | |
323 | resnetv24_stage3_conv62_fwd | Convolution | [256,256,14,14] | 3404338.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51389194.67 | 150610122.67 | 47.10 | 1.25 | 912.91 | true | 0.471127;0.470403;0.470062;0.470574;0.471099 | 251658240;251658240;251658240;251658240;251658240 | 51375840;51426016;51373856;51383648;51408096 | 150586048;150565056;150672352;150601152;150643168 | |
323 | resnetv24_stage3_conv62_fwd | Convolution | [256,256,14,14] | 3404338.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.00 | 269484032 | 151335968.00 | 54141205.33 | 48.30 | 1.31 | 1024.65 | true | 0.482775;0.482395;0.482701;0.481853;0.482457 | 269484032;269484032;269484032;269484032;269484032 | 54239936;54068032;53967616;54249728;54115648 | 151356480;151333920;151323872;151350112;151317568 | |
323 | resnetv24_stage3_conv62_fwd | Convolution | [256,256,14,14] | 3404338.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9330101.33 | 30.40 | 0.45 | 318.50 | true | 0.306099;0.300220;0.300799;0.304303;0.306979 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9388800;9379584;9276928;9333792;9260672 | |
324 | resnetv24_stage3_batchnorm62_fwd | BatchNorm | [256,256,14,14] | 6260.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51773706.67 | 55945365.33 | 88.20 | 0.75 | 568.28 | true | 0.881525;0.882011;0.882371;0.881529;0.883167 | 81264640;81264640;81264640;81264640;81264640 | 51778720;51779776;51756640;51772544;51769856 | 55900416;55939712;55940608;55955776;55958880 | |
325 | resnetv24_stage3_activation62 | Activation | [256,256,14,14] | 2752.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51368170.67 | 95.60 | 0.25 | 188.44 | true | 0.956906;0.956559;0.955645;0.955389;0.957783 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51378432;51370784;51365280;51368448;51362944 | |
326 | resnetv24_stage3_conv63_fwd | Convolution | [256,256,14,14] | 1417533 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2077.00 | 26409435136 | 156281717.33 | 152462176.00 | 24.90 | 85.54 | 12715.18 | false | 0.248878;0.248981;0.248754;0.248900;0.248985 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 152903104;154770656;157967840;156106656;162153152 | 150274848;154968512;152303008;151841728;153241792 | |
326 | resnetv24_stage3_conv63_fwd | Convolution | [256,256,14,14] | 1417533 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058046;0.058080;0.057928;0.058100;0.057919 | 0;0;0;0;0 | 2176;2688;2432;2688;2432 | 96;96;96;96;96 | |
327 | resnetv24_stage3__plus20 | elemwise_add | [256,1024,14,14] | 13253.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 3084362.67 | 3679040.00 | 95.70 | 7.60 | 69.37 | true | 0.957016;0.957035;0.956981;0.956913;0.956745 | 51380224;51380224;51380224;51380224;51380224 | 2462208;1688704;128;5679712;5102176 | 3028928;2314592;544;6252896;5693600 | |
328 | resnetv24_stage3_batchnorm63_fwd | BatchNorm | [256,1024,14,14] | 25211 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 52894154.67 | 58151264.00 | 90.20 | 2.93 | 586.40 | true | 0.902508;0.902255;0.902063;0.902016;0.902114 | 325058560;325058560;325058560;325058560;325058560 | 55048640;51823616;51810208;58294560;48574432 | 62799776;55817984;55839904;62795904;55814144 | |
329 | resnetv24_stage3_activation63 | Activation | [256,1024,14,14] | 10775.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.67 | 102760448 | 64225685.33 | 64223050.67 | 98.70 | 0.80 | 189.36 | true | 0.987934;0.987400;0.987498;0.987041;0.986822 | 102760448;102760448;102760448;102760448;102760448 | 61014144;77070752;67436960;61014400;64225696 | 61009984;77065952;67436032;61009280;64223136 | |
330 | resnetv24_stage3_conv64_fwd | Convolution | [256,1024,14,14] | 1481475.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.33 | 26332364800 | 157567381.33 | 37789034.67 | 24.80 | 134.79 | 12573.15 | false | 0.247405;0.247771;0.247029;0.247688;0.247512 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 157374144;158356480;153714528;157533280;157794720 | 37790656;37808064;37731264;37798784;37777664 | |
330 | resnetv24_stage3_conv64_fwd | Convolution | [256,1024,14,14] | 1481475.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057955;0.058125;0.057885;0.058082;0.057916 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
331 | resnetv24_stage3_batchnorm64_fwd | BatchNorm | [256,256,14,14] | 6508 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52036757.33 | 55830688.00 | 88.30 | 0.75 | 568.28 | true | 0.882330;0.883202;0.883047;0.882494;0.882296 | 81264640;81264640;81264640;81264640;81264640 | 55760544;55859424;55844000;55788640;55860704 | 52059360;52034976;52039360;52035936;52034944 | |
332 | resnetv24_stage3_activation64 | Activation | [256,256,14,14] | 2795.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51371968.00 | 95.80 | 0.25 | 188.90 | true | 0.958821;0.957353;0.958937;0.957394;0.957423 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51374400;51367520;51375072;51373120;51368384 | |
333 | resnetv24_stage3_conv65_fwd | Convolution | [256,256,14,14] | 3412784 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.67 | 19365101568 | 81996373.33 | 77680405.33 | 24.70 | 121.28 | 12921.55 | false | 0.246568;0.247001;0.246462;0.246832;0.247006 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 77679296;77752608;76670176;78334336;77609312 | 84071520;76543552;71097376;89693280;85374048 | |
333 | resnetv24_stage3_conv65_fwd | Convolution | [256,256,14,14] | 3412784 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.00 | 251658240 | 51398752.00 | 150576618.67 | 47.10 | 1.25 | 915.12 | true | 0.471278;0.471112;0.470174;0.471087;0.471488 | 251658240;251658240;251658240;251658240;251658240 | 51405088;51389728;51385760;51409696;51401440 | 150563008;150561344;150612800;150557280;150605504 | |
333 | resnetv24_stage3_conv65_fwd | Convolution | [256,256,14,14] | 3412784 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.33 | 269484032 | 151346677.33 | 54227712.00 | 48.20 | 1.31 | 1031.19 | true | 0.482420;0.482332;0.482618;0.482142;0.482310 | 269484032;269484032;269484032;269484032;269484032 | 151350304;151315616;151367104;151335552;151354176 | 54201632;54325920;54306464;54175040;54173408 | |
333 | resnetv24_stage3_conv65_fwd | Convolution | [256,256,14,14] | 3412784 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9337685.33 | 30.40 | 0.45 | 312.26 | true | 0.304468;0.302208;0.302278;0.307325;0.305346 | 5308416;5308416;5308416;5308416;5308416 | 9352576;9332736;9308416;9357344;9327744 | 2359552;2359552;2359552;2359552;2359552 | |
334 | resnetv24_stage3_batchnorm65_fwd | BatchNorm | [256,256,14,14] | 6354.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 145.00 | 81264640 | 51785674.67 | 55905888.00 | 88.20 | 0.75 | 560.45 | true | 0.882787;0.881717;0.882148;0.881815;0.881018 | 81264640;81264640;81264640;81264640;81264640 | 51781216;51782112;51790624;51791776;51784288 | 55898336;55922464;55896864;55880832;55932896 | |
335 | resnetv24_stage3_activation65 | Activation | [256,256,14,14] | 2758.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380597.33 | 51377898.67 | 95.70 | 0.25 | 188.44 | true | 0.957484;0.957103;0.956348;0.957393;0.956093 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380608;51380576;51380608 | 51378336;51375104;51381824;51370336;51380256 | |
336 | resnetv24_stage3_conv66_fwd | Convolution | [256,256,14,14] | 1425441 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2085.33 | 26409435136 | 155989333.33 | 152515552.00 | 24.90 | 85.60 | 12664.37 | false | 0.248930;0.248976;0.249010;0.249046;0.248999 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154382560;151138880;150601600;153747360;152660416 | 161556384;154719328;154097920;155696256;157552416 | |
336 | resnetv24_stage3_conv66_fwd | Convolution | [256,256,14,14] | 1425441 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057902;0.058118;0.057899;0.058044;0.057935 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;1664;2432;2432 | |
337 | resnetv24_stage3__plus21 | elemwise_add | [256,1024,14,14] | 13169.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.33 | 51380224 | 1077845.33 | 1674197.33 | 95.70 | 18.67 | 69.40 | false | 0.956305;0.956538;0.956749;0.956720;0.956784 | 51380224;51380224;51380224;51380224;51380224 | 1217984;950944;1064608;5830496;770208 | 1828864;1529792;1663936;6446528;1384320 | |
338 | resnetv24_stage3_batchnorm66_fwd | BatchNorm | [256,1024,14,14] | 24871 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 73331733.33 | 77904672.00 | 90.00 | 2.15 | 586.40 | true | 0.900352;0.900641;0.900447;0.900481;0.900535 | 325058560;325058560;325058560;325058560;325058560 | 77647744;61467552;207062112;58232768;80879904 | 83710720;66281088;223221312;62783456;83722208 | |
339 | resnetv24_stage3_activation66 | Activation | [256,1024,14,14] | 10925 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 544.33 | 102760448 | 63156448.00 | 64222816.00 | 98.70 | 0.81 | 188.78 | true | 0.987057;0.987019;0.986903;0.987376;0.986402 | 102760448;102760448;102760448;102760448;102760448 | 61018240;67436960;67436672;61014368;61014432 | 61015520;70644192;64222688;64222720;64223040 | |
340 | resnetv24_stage3_conv67_fwd | Convolution | [256,1024,14,14] | 1477551 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.67 | 26332364800 | 156068234.67 | 37745589.33 | 24.70 | 135.86 | 12559.15 | false | 0.247370;0.248234;0.247574;0.247169;0.247422 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 155376096;155817920;153286752;157010688;159475648 | 37816960;37762880;37673760;37724256;37749632 | |
340 | resnetv24_stage3_conv67_fwd | Convolution | [256,1024,14,14] | 1477551 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058464;0.058060;0.057935;0.058098;0.057920 | 0;0;0;0;0 | 2432;2688;2432;2688;2432 | 96;96;96;96;96 | |
341 | resnetv24_stage3_batchnorm67_fwd | BatchNorm | [256,256,14,14] | 6546 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52047338.67 | 55751125.33 | 88.30 | 0.75 | 568.28 | true | 0.882896;0.883438;0.882713;0.882079;0.881659 | 81264640;81264640;81264640;81264640;81264640 | 52047584;52022432;52071872;52049792;52044640 | 55717184;55813056;55669952;55760096;55776096 | |
342 | resnetv24_stage3_activation67 | Activation | [256,256,14,14] | 2743.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51372544.00 | 95.70 | 0.25 | 188.44 | true | 0.957021;0.957176;0.958219;0.957561;0.957271 | 25690112;25690112;25690112;25690112;25690112 | 51389536;51380576;51380576;51380576;51380576 | 51391776;51378016;51370304;51366400;51369312 | |
343 | resnetv24_stage3_conv68_fwd | Convolution | [256,256,14,14] | 3402936.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.33 | 19365101568 | 85273717.33 | 77845962.67 | 24.70 | 118.72 | 12933.06 | false | 0.246599;0.246800;0.246752;0.246543;0.246928 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 88207232;88936416;76854080;84095584;83518336 | 77843008;78832864;77584512;78110368;77116032 | |
343 | resnetv24_stage3_conv68_fwd | Convolution | [256,256,14,14] | 3402936.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51396234.67 | 150595232.00 | 47.10 | 1.25 | 912.91 | true | 0.470420;0.471375;0.470636;0.470435;0.471400 | 251658240;251658240;251658240;251658240;251658240 | 51374240;51404832;51392032;51391840;51404832 | 150573888;150607040;150596928;150581728;150610144 | |
343 | resnetv24_stage3_conv68_fwd | Convolution | [256,256,14,14] | 3402936.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.00 | 269484032 | 151331594.67 | 54143733.33 | 48.20 | 1.31 | 1024.65 | true | 0.481880;0.482668;0.482532;0.482613;0.482139 | 269484032;269484032;269484032;269484032;269484032 | 151324736;151322144;151334880;151335168;151340192 | 54082304;54093216;54213632;54124352;54254560 | |
343 | resnetv24_stage3_conv68_fwd | Convolution | [256,256,14,14] | 3402936.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9333674.67 | 30.30 | 0.45 | 318.50 | true | 0.304928;0.303252;0.300273;0.301924;0.310381 | 5308416;5308416;5308416;5308416;5308416 | 9337088;9324800;9333248;9330688;9345664 | 2359552;2359552;2359552;2359552;2359552 | |
344 | resnetv24_stage3_batchnorm68_fwd | BatchNorm | [256,256,14,14] | 6296 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 51733290.67 | 55904490.67 | 88.10 | 0.75 | 566.96 | true | 0.880510;0.881905;0.881403;0.881644;0.880824 | 81264640;81264640;81264640;81264640;81264640 | 51721984;51749504;51738016;51716672;51739872 | 55921248;55910016;55879488;55907264;55896192 | |
345 | resnetv24_stage3_activation68 | Activation | [256,256,14,14] | 2757.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380597.33 | 51366709.33 | 95.60 | 0.25 | 188.90 | true | 0.956753;0.955403;0.955070;0.957031;0.955225 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380608;51382880;51380608 | 51362272;51369184;51363584;51380256;51367360 | |
346 | resnetv24_stage3_conv69_fwd | Convolution | [256,256,14,14] | 1417372 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2083.67 | 26409435136 | 155355424.00 | 151952608.00 | 24.90 | 85.94 | 12674.50 | false | 0.248972;0.249007;0.248962;0.248978;0.248944 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153559456;157773824;163739040;153849600;154442848 | 150250176;155021888;154795456;150812192;150100352 | |
346 | resnetv24_stage3_conv69_fwd | Convolution | [256,256,14,14] | 1417372 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.057922;0.058063;0.057949;0.058097;0.058558 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;6016;2432;2688 | |
347 | resnetv24_stage3__plus22 | elemwise_add | [256,1024,14,14] | 13292.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 2880672.00 | 3467328.00 | 95.70 | 8.09 | 69.34 | true | 0.957134;0.956994;0.957098;0.956889;0.956770 | 51380224;51380224;51380224;51380224;51380224 | 2434016;4874784;2567232;3609888;2464896 | 3041792;5456288;3163520;4183136;3055328 | |
348 | resnetv24_stage3_batchnorm69_fwd | BatchNorm | [256,1024,14,14] | 25048 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 57210901.33 | 62797056.00 | 90.20 | 2.71 | 585.69 | true | 0.901965;0.901903;0.901715;0.902084;0.901927 | 325058560;325058560;325058560;325058560;325058560 | 48573376;61527520;58288192;51816992;74488832 | 52348160;69778656;62783776;55828736;80251264 | |
349 | resnetv24_stage3_activation69 | Activation | [256,1024,14,14] | 10794 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.33 | 102760448 | 67436938.67 | 66366378.67 | 98.70 | 0.77 | 189.48 | true | 0.987589;0.987621;0.987021;0.987564;0.987251 | 102760448;102760448;102760448;102760448;102760448 | 80281664;64225696;61014144;70648160;67436960 | 77065664;64222272;61016224;70654208;64222656 | |
350 | resnetv24_stage3_conv70_fwd | Convolution | [256,1024,14,14] | 1482600 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 158030954.67 | 37809258.67 | 24.80 | 134.46 | 12565.15 | false | 0.247035;0.247627;0.247597;0.247709;0.248157 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 167089152;159071424;155301504;155411936;159609504 | 37810560;37857760;37817408;37787456;37799808 | |
350 | resnetv24_stage3_conv70_fwd | Convolution | [256,1024,14,14] | 1482600 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057900;0.058065;0.057897;0.058084;0.057903 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
351 | resnetv24_stage3_batchnorm70_fwd | BatchNorm | [256,256,14,14] | 6257.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52006016.00 | 55753066.67 | 88.30 | 0.75 | 568.28 | true | 0.882503;0.882853;0.882242;0.882935;0.882331 | 81264640;81264640;81264640;81264640;81264640 | 52008000;51988384;51991648;52018400;52026464 | 55716960;55830880;55796448;55745792;55678880 | |
352 | resnetv24_stage3_activation70 | Activation | [256,256,14,14] | 2777.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51368938.67 | 95.70 | 0.25 | 188.90 | true | 0.957482;0.956826;0.956068;0.955327;0.957142 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380576;51380576;51380576 | 51368928;51369952;51365728;51367936;51372992 | |
353 | resnetv24_stage3_conv71_fwd | Convolution | [256,256,14,14] | 3426514.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.33 | 19365101568 | 85276597.33 | 76882517.33 | 24.70 | 119.42 | 12924.43 | false | 0.246949;0.247064;0.246838;0.246948;0.247104 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 84554080;84999104;86276608;72446656;87886816 | 77114880;77202944;77795680;75793568;76329728 | |
353 | resnetv24_stage3_conv71_fwd | Convolution | [256,256,14,14] | 3426514.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.33 | 251658240 | 51408544.00 | 150587690.67 | 47.10 | 1.25 | 914.01 | true | 0.471303;0.470507;0.470602;0.471589;0.471435 | 251658240;251658240;251658240;251658240;251658240 | 51424864;51385952;51417952;51418656;51389024 | 150607168;150588256;150576064;150598752;150575712 | |
353 | resnetv24_stage3_conv71_fwd | Convolution | [256,256,14,14] | 3426514.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.00 | 269484032 | 151338805.33 | 54236586.67 | 48.20 | 1.31 | 1032.51 | true | 0.482145;0.482069;0.482876;0.482468;0.481887 | 269484032;269484032;269484032;269484032;269484032 | 151329248;151348000;151226176;151341664;151345504 | 54379616;54294304;54097376;54227104;54188352 | |
353 | resnetv24_stage3_conv71_fwd | Convolution | [256,256,14,14] | 3426514.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9314346.67 | 30.20 | 0.45 | 325.01 | true | 0.302236;0.299125;0.319283;0.301125;0.302593 | 5308416;5308416;5308416;5308416;5308416 | 9311744;9298944;9332352;9281536;9370400 | 2359552;2359552;2359552;2359552;2359552 | |
354 | resnetv24_stage3_batchnorm71_fwd | BatchNorm | [256,256,14,14] | 6260.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 145.00 | 81264640 | 51777770.67 | 55920949.33 | 88.20 | 0.75 | 560.45 | true | 0.881993;0.881850;0.881712;0.881146;0.881613 | 81264640;81264640;81264640;81264640;81264640 | 55912576;55921184;55947744;55929088;55874848 | 51776384;51772128;51780512;51790912;51776416 | |
355 | resnetv24_stage3_activation71 | Activation | [256,256,14,14] | 2754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51365664.00 | 95.70 | 0.25 | 188.90 | true | 0.957077;0.955925;0.956752;0.955727;0.957606 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380576;51380576;51380576 | 51363776;51370624;51359296;51373344;51362592 | |
356 | resnetv24_stage3_conv72_fwd | Convolution | [256,256,14,14] | 1426325 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.00 | 26409435136 | 154054485.33 | 151216341.33 | 24.90 | 86.51 | 12684.65 | false | 0.248910;0.248960;0.249054;0.248503;0.248947 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 152487872;154893696;154546944;152722816;155338496 | 150744640;151509184;151395200;150525568;152040928 | |
356 | resnetv24_stage3_conv72_fwd | Convolution | [256,256,14,14] | 1426325 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057940;0.058538;0.057890;0.058105;0.057940 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
357 | resnetv24_stage3__plus23 | elemwise_add | [256,1024,14,14] | 13205 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 2161824.00 | 2742741.33 | 95.70 | 10.48 | 69.34 | true | 0.956406;0.956151;0.956744;0.956872;0.956475 | 51380224;51380224;51380224;51380224;51380224 | 3221088;1446432;1817952;128;5407200 | 3749888;2058688;2419648;512;6015584 | |
358 | resnetv24_stage3_batchnorm72_fwd | BatchNorm | [256,1024,14,14] | 25095.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 61528320.00 | 66297589.33 | 90.00 | 2.54 | 585.69 | true | 0.900301;0.900152;0.900695;0.900057;0.900631 | 325058560;325058560;325058560;325058560;325058560 | 61526976;61527840;61525408;74477440;61530144 | 66294400;62802624;62805472;76764192;69792896 | |
359 | resnetv24_stage3_activation72 | Activation | [256,1024,14,14] | 10778.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.33 | 102760448 | 82432992.00 | 82425088.00 | 98.70 | 0.62 | 190.18 | true | 0.987806;0.987304;0.987169;0.987185;0.987108 | 102760448;102760448;102760448;102760448;102760448 | 80295616;102757984;141285472;61012800;64221664 | 80304480;102766496;141296480;61014368;64228000 | |
360 | resnetv24_stage3_conv73_fwd | Convolution | [256,1024,14,14] | 1478821.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2100.00 | 26332364800 | 157597493.33 | 37755050.67 | 24.80 | 134.79 | 12539.22 | false | 0.248193;0.247253;0.248081;0.248954;0.247581 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 160191392;167429632;156261024;154093696;156340064 | 37692096;37801824;37835936;37631744;37771232 | |
360 | resnetv24_stage3_conv73_fwd | Convolution | [256,1024,14,14] | 1478821.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058443;0.058142;0.057912;0.058066;0.057897 | 0;0;0;0;0 | 1888;96;96;96;96 | 6016;2432;2688;2432;2688 | |
361 | resnetv24_stage3_batchnorm73_fwd | BatchNorm | [256,256,14,14] | 7047 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52051680.00 | 55754784.00 | 88.20 | 0.75 | 565.65 | true | 0.883120;0.882525;0.881963;0.881930;0.882412 | 81264640;81264640;81264640;81264640;81264640 | 52055744;52051808;52043712;52052096;52051136 | 55764736;55708352;55773056;55726560;55785568 | |
362 | resnetv24_stage3_activation73 | Activation | [256,256,14,14] | 2806.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 135.33 | 25690112 | 51380576.00 | 51367392.00 | 95.60 | 0.25 | 189.83 | true | 0.957177;0.956311;0.954860;0.954863;0.958166 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380608;51380576;51380576 | 51364992;51388192;51372128;51365056;51363040 | |
363 | resnetv24_stage3_conv74_fwd | Convolution | [256,256,14,14] | 3400965.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.00 | 19365101568 | 83580256.00 | 77371797.33 | 24.70 | 120.32 | 12927.30 | false | 0.246980;0.246716;0.246546;0.246665;0.246915 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 85229120;85954848;80058176;75817760;85453472 | 76710560;74524736;77777920;78200896;77626912 | |
363 | resnetv24_stage3_conv74_fwd | Convolution | [256,256,14,14] | 3400965.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51413664.00 | 150583680.00 | 47.10 | 1.25 | 911.81 | true | 0.471184;0.471092;0.470304;0.470510;0.471666 | 251658240;251658240;251658240;251658240;251658240 | 51381472;51421408;51410784;51425888;51408800 | 150633536;150543200;150555872;150652640;150561632 | |
363 | resnetv24_stage3_conv74_fwd | Convolution | [256,256,14,14] | 3400965.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.67 | 269484032 | 151309696.00 | 54207701.33 | 48.20 | 1.31 | 1022.06 | true | 0.481951;0.482547;0.481796;0.481404;0.482327 | 269484032;269484032;269484032;269484032;269484032 | 151349856;151314976;151284288;151329824;151261408 | 54281984;54168832;54123232;54199680;54254592 | |
363 | resnetv24_stage3_conv74_fwd | Convolution | [256,256,14,14] | 3400965.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9331082.67 | 30.20 | 0.45 | 312.26 | true | 0.300774;0.301648;0.314249;0.302005;0.302734 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9284224;9372032;9352960;9276800;9356064 | |
364 | resnetv24_stage3_batchnorm74_fwd | BatchNorm | [256,256,14,14] | 6319.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51773578.67 | 55931114.67 | 88.20 | 0.75 | 568.28 | true | 0.882529;0.882254;0.881614;0.882260;0.882066 | 81264640;81264640;81264640;81264640;81264640 | 51775168;51770304;51775488;51769376;51775264 | 55934944;55912896;55933184;55925216;55937600 | |
365 | resnetv24_stage3_activation74 | Activation | [256,256,14,14] | 2785.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51374549.33 | 95.80 | 0.25 | 188.90 | true | 0.957573;0.957094;0.957508;0.958860;0.957516 | 25690112;25690112;25690112;25690112;25690112 | 51377472;51373344;51378560;51364160;51372832 | 51380576;51380608;51380576;51380576;51380576 | |
366 | resnetv24_stage3_conv75_fwd | Convolution | [256,256,14,14] | 1415956 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2080.00 | 26409435136 | 156160245.33 | 152145600.00 | 24.90 | 85.66 | 12696.84 | false | 0.248851;0.248930;0.248880;0.248981;0.248884 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154812960;157079424;155048640;166177536;156352672 | 150919072;151959072;153340064;155641024;151137664 | |
366 | resnetv24_stage3_conv75_fwd | Convolution | [256,256,14,14] | 1415956 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2688.00 | 5.80 | 0.00 | 0.00 | true | 0.057905;0.058511;0.057930;0.058053;0.057929 | 0;0;0;0;0 | 2560;2688;2432;2816;12928 | 96;96;96;96;96 | |
367 | resnetv24_stage3__plus24 | elemwise_add | [256,1024,14,14] | 13211.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 5360288.00 | 5951776.00 | 95.70 | 4.54 | 69.34 | true | 0.957237;0.957118;0.956838;0.956769;0.957089 | 51380224;51380224;51380224;51380224;51380224 | 5441344;5063392;3979680;6015936;5576128 | 6018336;5683968;4599584;6556512;6153024 | |
368 | resnetv24_stage3_batchnorm75_fwd | BatchNorm | [256,1024,14,14] | 25037.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 553.00 | 325058560 | 59364128.00 | 65127104.00 | 90.10 | 2.61 | 587.81 | true | 0.900597;0.900558;0.900400;0.900687;0.900264 | 325058560;325058560;325058560;325058560;325058560 | 58291232;61515488;51813888;58285664;74475872 | 62811680;69769184;55830592;62800448;83739584 | |
369 | resnetv24_stage3_activation75 | Activation | [256,1024,14,14] | 10791 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 538.67 | 102760448 | 76000213.33 | 79209002.67 | 98.70 | 0.66 | 190.77 | true | 0.987393;0.987165;0.987059;0.987731;0.987100 | 102760448;102760448;102760448;102760448;102760448 | 77065440;77064064;89909888;67432896;83497504 | 73859136;73859488;89915808;67447200;80282016 | |
370 | resnetv24_stage3_conv76_fwd | Convolution | [256,1024,14,14] | 1481413.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2098.00 | 26332364800 | 156992352.00 | 37767914.67 | 24.70 | 135.20 | 12551.17 | false | 0.247136;0.247986;0.247685;0.247189;0.246850 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 171464832;155541664;155687872;154482080;159747520 | 37861728;37775392;37710464;37754752;37773600 | |
370 | resnetv24_stage3_conv76_fwd | Convolution | [256,1024,14,14] | 1481413.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057947;0.058058;0.057931;0.058051;0.057939 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;1920 | |
371 | resnetv24_stage3_batchnorm76_fwd | BatchNorm | [256,256,14,14] | 6278.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52048480.00 | 55773781.33 | 88.30 | 0.75 | 568.28 | true | 0.882791;0.882909;0.882574;0.882231;0.882552 | 81264640;81264640;81264640;81264640;81264640 | 55777920;55773440;55763520;55769984;55823456 | 52041152;52050272;52059008;52054016;52028032 | |
372 | resnetv24_stage3_activation76 | Activation | [256,256,14,14] | 2752.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51370005.33 | 95.70 | 0.25 | 188.44 | true | 0.957521;0.955657;0.956387;0.957604;0.956805 | 25690112;25690112;25690112;25690112;25690112 | 51367968;51369600;51370400;51370016;51377664 | 51380576;51380576;51380576;51380608;51380576 | |
373 | resnetv24_stage3_conv77_fwd | Convolution | [256,256,14,14] | 3414537.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.33 | 19365101568 | 83566528.00 | 77810474.67 | 24.70 | 120.00 | 12924.43 | false | 0.246832;0.246560;0.246693;0.246676;0.246554 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 84482400;81301600;84375008;86566016;81842176 | 78031392;77819264;77580768;79669632;76546816 | |
373 | resnetv24_stage3_conv77_fwd | Convolution | [256,256,14,14] | 3414537.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.67 | 251658240 | 51405194.67 | 150579381.33 | 47.10 | 1.25 | 909.61 | true | 0.471115;0.470215;0.470515;0.471177;0.470100 | 251658240;251658240;251658240;251658240;251658240 | 150592704;150580704;150580832;150576608;150572640 | 51407392;51398496;51409696;51426016;51390368 | |
373 | resnetv24_stage3_conv77_fwd | Convolution | [256,256,14,14] | 3414537.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.00 | 269484032 | 151340437.33 | 54201472.00 | 48.20 | 1.31 | 1032.51 | true | 0.481634;0.482037;0.483032;0.482342;0.482112 | 269484032;269484032;269484032;269484032;269484032 | 151326752;151276736;151339136;151380864;151355424 | 54183328;54217888;54203200;54307360;54116992 | |
373 | resnetv24_stage3_conv77_fwd | Convolution | [256,256,14,14] | 3414537.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9353045.33 | 30.10 | 0.45 | 312.26 | true | 0.301719;0.299963;0.303793;0.301493;0.300784 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9337600;9359232;9362304;9379232;9305472 | |
374 | resnetv24_stage3_batchnorm77_fwd | BatchNorm | [256,256,14,14] | 6277 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 145.00 | 81264640 | 51785984.00 | 55884394.67 | 88.20 | 0.75 | 560.45 | true | 0.881537;0.882069;0.881514;0.881611;0.881513 | 81264640;81264640;81264640;81264640;81264640 | 55884896;55878400;55889888;55866624;55943744 | 51765600;51784512;51785248;51795136;51788192 | |
375 | resnetv24_stage3_activation77 | Activation | [256,256,14,14] | 2739 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51371765.33 | 95.60 | 0.25 | 188.90 | true | 0.955395;0.956530;0.956434;0.956392;0.956853 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380576;51380576;51380576 | 51372192;51370976;51372128;51366912;51372960 | |
376 | resnetv24_stage3_conv78_fwd | Convolution | [256,256,14,14] | 1427112.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2077.00 | 26409435136 | 154315349.33 | 151178997.33 | 24.90 | 86.45 | 12715.18 | false | 0.248869;0.248990;0.248970;0.248948;0.248775 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154137632;159833312;154344192;152926784;154464224 | 152136576;153542848;150817408;149661024;150583008 | |
376 | resnetv24_stage3_conv78_fwd | Convolution | [256,256,14,14] | 1427112.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058152;0.058092;0.057942;0.058041;0.058216 | 0;0;0;0;0 | 96;96;96;96;352 | 28544;2432;2176;2432;2944 | |
377 | resnetv24_stage3__plus25 | elemwise_add | [256,1024,14,14] | 13242.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 3278165.33 | 3879648.00 | 95.70 | 7.18 | 69.43 | true | 0.956449;0.956739;0.956625;0.956302;0.956570 | 51380224;51380224;51380224;51380224;51380224 | 3700000;5984;5659840;4670176;3268768 | 3097408;128;5076032;4029088;2708000 | |
378 | resnetv24_stage3_batchnorm78_fwd | BatchNorm | [256,1024,14,14] | 24969.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 58281365.33 | 62810581.33 | 90.00 | 2.68 | 585.69 | true | 0.900055;0.900476;0.900426;0.900484;0.900328 | 325058560;325058560;325058560;325058560;325058560 | 48568960;67995584;55045920;67996480;51802592 | 55838784;73287744;62800448;69792512;55801472 | |
379 | resnetv24_stage3_activation78 | Activation | [256,1024,14,14] | 10828.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 544.00 | 102760448 | 72789045.33 | 72789610.67 | 98.70 | 0.71 | 188.90 | true | 0.986391;0.987393;0.987206;0.987591;0.987787 | 102760448;102760448;102760448;102760448;102760448 | 70648160;77070752;61014368;83493280;70648224 | 70654240;77071840;57802880;83489728;70642752 | |
380 | resnetv24_stage3_conv79_fwd | Convolution | [256,1024,14,14] | 1477007.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.67 | 26332364800 | 157973248.00 | 37740832.00 | 24.80 | 134.55 | 12571.15 | false | 0.247235;0.247302;0.247986;0.248076;0.247598 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 159152256;156572448;156794656;164394784;157972832 | 37700224;37735648;37786624;37796352;37696736 | |
380 | resnetv24_stage3_conv79_fwd | Convolution | [256,1024,14,14] | 1477007.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057880;0.058079;0.057900;0.058109;0.057899 | 0;0;0;0;0 | 96;10080;96;96;96 | 2176;24192;2432;2688;2432 | |
381 | resnetv24_stage3_batchnorm79_fwd | BatchNorm | [256,256,14,14] | 6311.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52039829.33 | 55758421.33 | 88.30 | 0.75 | 568.28 | true | 0.882452;0.882953;0.882582;0.882800;0.882763 | 81264640;81264640;81264640;81264640;81264640 | 55723328;55785600;55776096;55706176;55775840 | 52055360;52030688;52023584;52046048;52042752 | |
382 | resnetv24_stage3_activation79 | Activation | [256,256,14,14] | 2731.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51368693.33 | 95.70 | 0.25 | 188.90 | true | 0.958756;0.957470;0.956937;0.954349;0.957181 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51357920;51370144;51370336;51370816;51365600 | |
383 | resnetv24_stage3_conv80_fwd | Convolution | [256,256,14,14] | 3404334.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1499.33 | 19365101568 | 86449888.00 | 77118773.33 | 24.70 | 118.39 | 12915.81 | false | 0.246675;0.246809;0.246982;0.246930;0.246935 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 88718272;82947648;90396000;85034528;85596864 | 77515392;73446784;76193728;77791200;77647200 | |
383 | resnetv24_stage3_conv80_fwd | Convolution | [256,256,14,14] | 3404334.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.00 | 251658240 | 51389834.67 | 150577130.67 | 47.10 | 1.25 | 915.12 | true | 0.470669;0.470848;0.471178;0.470675;0.471024 | 251658240;251658240;251658240;251658240;251658240 | 51373920;51379296;51425312;51397344;51392864 | 150612288;150524512;150580544;150538560;150625248 | |
383 | resnetv24_stage3_conv80_fwd | Convolution | [256,256,14,14] | 3404334.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.67 | 269484032 | 151337152.00 | 54222133.33 | 48.20 | 1.31 | 1022.06 | true | 0.482648;0.483184;0.482148;0.482260;0.482545 | 269484032;269484032;269484032;269484032;269484032 | 54306976;54108096;54110016;54289568;54266816 | 151337376;151336320;151338048;151314272;151337760 | |
383 | resnetv24_stage3_conv80_fwd | Convolution | [256,256,14,14] | 3404334.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9358144.00 | 30.70 | 0.45 | 318.50 | true | 0.300192;0.308767;0.305615;0.308595;0.307166 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2364416;2359552 | 9331104;9408896;9351328;9392000;9287072 | |
384 | resnetv24_stage3_batchnorm80_fwd | BatchNorm | [256,256,14,14] | 6315.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 51775381.33 | 55919552.00 | 88.20 | 0.75 | 566.96 | true | 0.882015;0.881072;0.881644;0.881289;0.882472 | 81264640;81264640;81264640;81264640;81264640 | 51769888;51775168;51788320;51774784;51776192 | 55908160;55904128;55923104;55927392;55936800 | |
385 | resnetv24_stage3_activation80 | Activation | [256,256,14,14] | 2767.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51370560.00 | 95.60 | 0.25 | 188.44 | true | 0.955556;0.957212;0.956077;0.958311;0.955964 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380576;51380576;51380576 | 51363488;51374944;51369984;51369120;51372576 | |
386 | resnetv24_stage3_conv81_fwd | Convolution | [256,256,14,14] | 1416673.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2083.00 | 26409435136 | 156281802.67 | 152248266.67 | 24.90 | 85.60 | 12678.56 | false | 0.248890;0.249023;0.248948;0.249069;0.248993 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 162319552;154522176;153539808;154753344;159569888 | 156347232;150969792;150753344;152149024;153625984 | |
386 | resnetv24_stage3_conv81_fwd | Convolution | [256,256,14,14] | 1416673.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 6698.67 | 5.80 | 0.00 | 0.00 | true | 0.057913;0.058073;0.057902;0.058213;0.057896 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2688;16768;14720 | |
387 | resnetv24_stage3__plus26 | elemwise_add | [256,1024,14,14] | 13229 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 2781845.33 | 3315669.33 | 95.70 | 8.43 | 69.34 | true | 0.956924;0.956784;0.957018;0.957130;0.956983 | 51380224;51380224;51380224;51380224;51380224 | 128;2309760;5616000;5819456;419776 | 416;2874368;6204576;6429280;868064 | |
388 | resnetv24_stage3_batchnorm81_fwd | BatchNorm | [256,1024,14,14] | 25885.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 74478058.67 | 81409952.00 | 90.20 | 2.09 | 585.69 | true | 0.901917;0.902030;0.901950;0.901761;0.901525 | 325058560;325058560;325058560;325058560;325058560 | 90709920;69787552;97671360;62802240;83732384 | 84190176;64765856;90660000;55055648;74478144 | |
389 | resnetv24_stage3_activation81 | Activation | [256,1024,14,14] | 10806 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.33 | 102760448 | 66367936.00 | 67433770.67 | 98.70 | 0.77 | 190.18 | true | 0.986836;0.987664;0.987794;0.986979;0.987577 | 102760448;102760448;102760448;102760448;102760448 | 64226112;67436960;67440736;61014432;77070752 | 64219680;70648864;67432768;57798880;77065728 | |
390 | resnetv24_stage3_conv82_fwd | Convolution | [256,1024,14,14] | 1483432.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2093.33 | 26332364800 | 155898261.33 | 37773610.67 | 24.80 | 135.96 | 12579.16 | false | 0.247552;0.247334;0.247771;0.248161;0.246959 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37795072;37722656;37768608;37788608;37763616 | 154874976;156325888;156493920;153555744;157234560 | |
390 | resnetv24_stage3_conv82_fwd | Convolution | [256,1024,14,14] | 1483432.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057923;0.058073;0.057901;0.058041;0.057930 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;10112;2432;2432 | |
391 | resnetv24_stage3_batchnorm82_fwd | BatchNorm | [256,256,14,14] | 6291.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52039754.67 | 55773898.67 | 88.30 | 0.75 | 566.96 | true | 0.881900;0.883365;0.883434;0.882174;0.883546 | 81264640;81264640;81264640;81264640;81264640 | 52024608;52056864;52024864;52048128;52046272 | 55818624;55742880;55823200;55760192;55742048 | |
392 | resnetv24_stage3_activation82 | Activation | [256,256,14,14] | 2754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51370432.00 | 95.60 | 0.25 | 188.44 | true | 0.956245;0.955336;0.955554;0.957243;0.956802 | 25690112;25690112;25690112;25690112;25690112 | 51370368;51358560;51369152;51371776;51388512 | 51380576;51380576;51380576;51380576;51387232 | |
393 | resnetv24_stage3_conv83_fwd | Convolution | [256,256,14,14] | 3411755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1511.67 | 19365101568 | 83465621.33 | 77544000.00 | 24.70 | 120.27 | 12810.43 | false | 0.246849;0.246714;0.246791;0.246537;0.246856 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 77761344;81967776;79801760;88867232;88627328 | 77605504;77427872;77515456;78026688;77511040 | |
393 | resnetv24_stage3_conv83_fwd | Convolution | [256,256,14,14] | 3411755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.33 | 251658240 | 51413450.67 | 150599562.67 | 47.10 | 1.25 | 910.71 | true | 0.471158;0.470861;0.471538;0.469926;0.470218 | 251658240;251658240;251658240;251658240;251658240 | 150541248;150576736;150663520;150606432;150615520 | 51417056;51404128;51375648;51419168;51429856 | |
393 | resnetv24_stage3_conv83_fwd | Convolution | [256,256,14,14] | 3411755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.33 | 269484032 | 151326346.67 | 54262112.00 | 48.20 | 1.31 | 1035.15 | true | 0.482109;0.482428;0.481856;0.482707;0.482211 | 269484032;269484032;269484032;269484032;269484032 | 54180832;54221472;54343840;54221024;54364992 | 151345376;151310208;151320288;151313376;151357504 | |
393 | resnetv24_stage3_conv83_fwd | Convolution | [256,256,14,14] | 3411755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9324458.67 | 30.60 | 0.45 | 318.50 | true | 0.301832;0.309193;0.303461;0.307413;0.308134 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9386528;9305984;9274112;9338240;9329152 | |
394 | resnetv24_stage3_batchnorm83_fwd | BatchNorm | [256,256,14,14] | 6315.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.33 | 81264640 | 51776064.00 | 55882464.00 | 88.20 | 0.75 | 563.04 | true | 0.882794;0.881314;0.881989;0.882215;0.881437 | 81264640;81264640;81264640;81264640;81264640 | 51772192;51774880;51773312;51800800;51780000 | 55924736;55932640;55843136;55859712;55862944 | |
395 | resnetv24_stage3_activation83 | Activation | [256,256,14,14] | 2754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51367413.33 | 95.70 | 0.25 | 187.98 | true | 0.956249;0.955898;0.955692;0.957426;0.958107 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380576;51380576;51380576 | 51365440;51361824;51363488;51373312;51377344 | |
396 | resnetv24_stage3_conv84_fwd | Convolution | [256,256,14,14] | 1428176.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2083.67 | 26409435136 | 154965034.67 | 151418282.67 | 24.90 | 86.20 | 12674.50 | false | 0.248911;0.248658;0.249045;0.249016;0.249020 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 150195072;152338656;152582336;149898592;151721120 | 152634016;156439936;157116288;153438944;155016224 | |
396 | resnetv24_stage3_conv84_fwd | Convolution | [256,256,14,14] | 1428176.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057953;0.058100;0.057895;0.058036;0.057951 | 0;0;0;0;0 | 96;8032;96;96;96 | 2432;19200;2432;2432;2432 | |
397 | resnetv24_stage3__plus27 | elemwise_add | [256,1024,14,14] | 13158.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 2161237.33 | 2737877.33 | 95.70 | 10.49 | 69.43 | true | 0.956509;0.956584;0.956613;0.956415;0.956766 | 51380224;51380224;51380224;51380224;51380224 | 4288;5449984;1197248;5740864;1566400 | 128;4860064;637632;5094080;986016 | |
398 | resnetv24_stage3_batchnorm84_fwd | BatchNorm | [256,1024,14,14] | 24747.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 56126464.00 | 62806645.33 | 90.00 | 2.73 | 586.40 | true | 0.900429;0.900462;0.900722;0.900321;0.900119 | 325058560;325058560;325058560;325058560;325058560 | 48571776;71236608;48570880;84187456;48571008 | 52333280;76763104;55823840;90732992;55832992 | |
399 | resnetv24_stage3_activation84 | Activation | [256,1024,14,14] | 10938.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.67 | 102760448 | 66366517.33 | 68503690.67 | 98.70 | 0.76 | 189.36 | true | 0.987170;0.987305;0.986672;0.987789;0.987546 | 102760448;102760448;102760448;102760448;102760448 | 67436896;64225696;61014368;70648224;67436960 | 70647008;64218720;64224224;70643808;70643040 | |
400 | resnetv24_stage3_conv85_fwd | Convolution | [256,1024,14,14] | 1476964 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 163404992.00 | 37700256.00 | 24.80 | 130.94 | 12565.15 | false | 0.247255;0.247742;0.247768;0.248205;0.247735 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 159591552;167451744;163719968;154630368;166903456 | 37697792;37749152;37709952;37647136;37693024 | |
400 | resnetv24_stage3_conv85_fwd | Convolution | [256,1024,14,14] | 1476964 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2688.00 | 5.80 | 0.00 | 0.00 | true | 0.058583;0.058056;0.057914;0.058111;0.058061 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;5632;2688;2432;2688 | |
401 | resnetv24_stage3_batchnorm85_fwd | BatchNorm | [256,256,14,14] | 6258.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52045685.33 | 55734997.33 | 88.20 | 0.75 | 568.28 | true | 0.882409;0.882293;0.882595;0.882367;0.881914 | 81264640;81264640;81264640;81264640;81264640 | 52047328;52045792;52043936;52055040;52038368 | 55703520;55781600;55791648;55719872;55689888 | |
402 | resnetv24_stage3_activation85 | Activation | [256,256,14,14] | 2758 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380586.67 | 51373642.67 | 95.70 | 0.25 | 188.90 | true | 0.957047;0.957288;0.956846;0.958014;0.956756 | 25690112;25690112;25690112;25690112;25690112 | 51377472;51370208;51371808;51371648;51387520 | 51380576;51380576;51380608;51380608;51380576 | |
403 | resnetv24_stage3_conv86_fwd | Convolution | [256,256,14,14] | 3403227.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.33 | 19365101568 | 84504362.67 | 75762880.00 | 24.70 | 120.83 | 12924.43 | false | 0.247065;0.246876;0.246830;0.246876;0.246597 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 81910880;83552800;88049408;81549792;89606368 | 77683904;71611552;76783456;77453248;73051936 | |
403 | resnetv24_stage3_conv86_fwd | Convolution | [256,256,14,14] | 3403227.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51396128.00 | 150576714.67 | 47.10 | 1.25 | 911.81 | true | 0.471380;0.470730;0.470653;0.470376;0.470768 | 251658240;251658240;251658240;251658240;251658240 | 51377056;51379168;51401632;51407584;51427936 | 150568384;150588768;150643136;150572992;150524000 | |
403 | resnetv24_stage3_conv86_fwd | Convolution | [256,256,14,14] | 3403227.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 264.00 | 269484032 | 151337760.00 | 54210293.33 | 48.30 | 1.31 | 1020.77 | true | 0.482502;0.482828;0.482816;0.482813;0.482651 | 269484032;269484032;269484032;269484032;269484032 | 151343008;151322240;151345824;151337024;151333248 | 54234144;54075200;54280256;54209440;54187296 | |
403 | resnetv24_stage3_conv86_fwd | Convolution | [256,256,14,14] | 3403227.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9334314.67 | 30.60 | 0.45 | 331.78 | true | 0.304641;0.302130;0.307570;0.310450;0.305501 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9329920;9330432;9288480;9342592;9372160 | |
404 | resnetv24_stage3_batchnorm86_fwd | BatchNorm | [256,256,14,14] | 6299 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 51773322.67 | 55908437.33 | 88.20 | 0.75 | 566.96 | true | 0.881382;0.880013;0.883035;0.882015;0.881822 | 81264640;81264640;81264640;81264640;81264640 | 51767008;51775520;51777440;51761920;51789792 | 55886272;55897408;55905728;55922176;55944608 | |
405 | resnetv24_stage3_activation86 | Activation | [256,256,14,14] | 2751.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51369408.00 | 95.70 | 0.25 | 188.44 | true | 0.957301;0.957313;0.957448;0.956384;0.956657 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380576;51380576;51380576 | 51371072;51366176;51371968;51361088;51370976 | |
406 | resnetv24_stage3_conv87_fwd | Convolution | [256,256,14,14] | 1417220.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.67 | 26409435136 | 154392714.67 | 151469760.00 | 24.90 | 86.34 | 12680.58 | false | 0.248875;0.248885;0.248978;0.248995;0.248993 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 151547104;154266912;152783936;156127296;156573088 | 150126304;150839456;151096032;152473792;152584064 | |
406 | resnetv24_stage3_conv87_fwd | Convolution | [256,256,14,14] | 1417220.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057920;0.058082;0.057976;0.058089;0.058039 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;2688;2432 | |
407 | resnetv24_stage3__plus28 | elemwise_add | [256,1024,14,14] | 13216.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 3418826.67 | 4028277.33 | 95.70 | 6.90 | 69.34 | true | 0.957208;0.956763;0.957090;0.956976;0.957303 | 51380224;51380224;51380224;51380224;51380224 | 4251840;2222944;5210976;3781696;640 | 4830656;2841600;5767552;4412576;1088 | |
408 | resnetv24_stage3_batchnorm87_fwd | BatchNorm | [256,1024,14,14] | 25307 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 56127541.33 | 61631178.67 | 90.00 | 2.76 | 586.40 | true | 0.900423;0.900310;0.900564;0.900477;0.900271 | 325058560;325058560;325058560;325058560;325058560 | 58284480;55054624;48576448;55043520;58284736 | 62800192;62801376;55831968;59301472;62791872 | |
409 | resnetv24_stage3_activation87 | Activation | [256,1024,14,14] | 10814 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.00 | 102760448 | 66369920.00 | 65293333.33 | 98.70 | 0.78 | 190.30 | true | 0.986941;0.987388;0.987474;0.986969;0.986923 | 102760448;102760448;102760448;102760448;102760448 | 64232512;73859488;57803104;73859744;61017760 | 64224480;70643008;57802720;73857920;61012512 | |
410 | resnetv24_stage3_conv88_fwd | Convolution | [256,1024,14,14] | 1481491 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.00 | 26332364800 | 158699136.00 | 37772021.33 | 24.70 | 134.03 | 12563.15 | false | 0.247249;0.247584;0.247141;0.247752;0.247524 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 155823744;161260864;163035296;159012800;154676256 | 37793600;37791840;37730624;37843360;37714912 | |
410 | resnetv24_stage3_conv88_fwd | Convolution | [256,1024,14,14] | 1481491 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057926;0.058087;0.057888;0.058078;0.058282 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;9088;2432;2432 | |
411 | resnetv24_stage3_batchnorm88_fwd | BatchNorm | [256,256,14,14] | 6284.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52063989.33 | 55696906.67 | 88.30 | 0.75 | 568.28 | true | 0.882087;0.881937;0.882914;0.882711;0.883294 | 81264640;81264640;81264640;81264640;81264640 | 52047520;52057984;52076064;52068736;52065248 | 55846464;55689824;55708960;55691936;55660128 | |
412 | resnetv24_stage3_activation88 | Activation | [256,256,14,14] | 2836.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380586.67 | 51371445.33 | 95.80 | 0.25 | 188.90 | true | 0.957727;0.957088;0.957768;0.957473;0.958648 | 25690112;25690112;25690112;25690112;25690112 | 51398752;51380576;51380576;51380608;51380576 | 51406400;51372064;51372160;51369664;51370112 | |
413 | resnetv24_stage3_conv89_fwd | Convolution | [256,256,14,14] | 3417365 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 85291626.67 | 77361834.67 | 24.70 | 119.06 | 12930.18 | false | 0.246987;0.246928;0.247245;0.246643;0.247056 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 83168704;89147040;85722176;86984000;80334880 | 77388896;77060576;75925824;77806240;77636032 | |
413 | resnetv24_stage3_conv89_fwd | Convolution | [256,256,14,14] | 3417365 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 277.33 | 251658240 | 51394101.33 | 150590069.33 | 47.10 | 1.25 | 907.42 | true | 0.471103;0.470887;0.470879;0.472347;0.470645 | 251658240;251658240;251658240;251658240;251658240 | 51398880;51388064;51371616;51395360;51423008 | 150490688;150574016;150593472;150628192;150602720 | |
413 | resnetv24_stage3_conv89_fwd | Convolution | [256,256,14,14] | 3417365 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.00 | 269484032 | 151348160.00 | 54247114.67 | 48.20 | 1.31 | 1032.51 | true | 0.482624;0.482416;0.482248;0.482262;0.482793 | 269484032;269484032;269484032;269484032;269484032 | 54288160;54275104;54262240;54119520;54204000 | 151356352;151358304;151342624;151290464;151345504 | |
413 | resnetv24_stage3_conv89_fwd | Convolution | [256,256,14,14] | 3417365 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9337653.33 | 30.50 | 0.45 | 331.78 | true | 0.301460;0.309369;0.305583;0.302365;0.306674 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9396832;9375872;9327648;9309440;9287296 | |
414 | resnetv24_stage3_batchnorm89_fwd | BatchNorm | [256,256,14,14] | 6284.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 145.00 | 81264640 | 51788160.00 | 55905162.67 | 88.20 | 0.75 | 560.45 | true | 0.882048;0.882001;0.882555;0.882188;0.881688 | 81264640;81264640;81264640;81264640;81264640 | 55940320;55895744;55841888;55910528;55909216 | 51788000;51796032;51780608;51779232;51795872 | |
415 | resnetv24_stage3_activation89 | Activation | [256,256,14,14] | 2755.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51370112.00 | 95.70 | 0.25 | 188.90 | true | 0.956339;0.957919;0.956329;0.957526;0.957641 | 25690112;25690112;25690112;25690112;25690112 | 51362912;51367584;51373312;51373920;51369440 | 51380576;51380576;51380576;51380576;51380576 | |
416 | resnetv24_stage3_conv90_fwd | Convolution | [256,256,14,14] | 1427502.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.00 | 26409435136 | 156413280.00 | 152042858.67 | 24.90 | 85.62 | 12684.65 | false | 0.249010;0.249003;0.248987;0.249022;0.248996 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 161456736;159014624;153122944;153963584;156261632 | 154169056;152730016;151428768;150205440;151969792 | |
416 | resnetv24_stage3_conv90_fwd | Convolution | [256,256,14,14] | 1427502.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058157;0.058075;0.057910;0.058075;0.058070 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;13056;2432 | |
417 | resnetv24_stage3__plus29 | elemwise_add | [256,1024,14,14] | 13325.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 1691936.00 | 2099061.33 | 95.60 | 13.55 | 69.43 | true | 0.956636;0.956636;0.956305;0.956206;0.956551 | 51380224;51380224;51380224;51380224;51380224 | 1513952;128;6120864;3561728;128 | 2092800;544;6727936;4203776;608 | |
418 | resnetv24_stage3_batchnorm90_fwd | BatchNorm | [256,1024,14,14] | 25241 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 63686720.00 | 67458474.67 | 90.10 | 2.48 | 586.40 | true | 0.900280;0.900657;0.900681;0.900416;0.900739 | 325058560;325058560;325058560;325058560;325058560 | 58285760;77720768;51799488;74487424;58286976 | 62789056;83751200;55824224;76775808;62810560 | |
419 | resnetv24_stage3_activation90 | Activation | [256,1024,14,14] | 10823.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.67 | 102760448 | 67437504.00 | 67434442.67 | 98.70 | 0.76 | 189.36 | true | 0.986994;0.987588;0.987701;0.987160;0.987483 | 102760448;102760448;102760448;102760448;102760448 | 61016128;86704544;61014368;80282016;57803168 | 64224352;89907008;61014240;77064736;57802016 | |
420 | resnetv24_stage3_conv91_fwd | Convolution | [256,1024,14,14] | 1479160 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.00 | 26332364800 | 156874773.33 | 37771050.67 | 24.70 | 135.28 | 12563.15 | false | 0.247049;0.247954;0.247202;0.247476;0.247664 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 155394816;160623328;156501248;158267616;155855456 | 37781344;37803840;37752544;37654592;37779264 | |
420 | resnetv24_stage3_conv91_fwd | Convolution | [256,1024,14,14] | 1479160 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2560.00 | 5.80 | 0.00 | 0.00 | true | 0.057889;0.058073;0.058236;0.058082;0.058051 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2560;2688;2432 | |
421 | resnetv24_stage3_batchnorm91_fwd | BatchNorm | [256,256,14,14] | 6474 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52043594.67 | 55758698.67 | 88.20 | 0.75 | 568.28 | true | 0.881897;0.883316;0.882495;0.882414;0.882509 | 81264640;81264640;81264640;81264640;81264640 | 52053312;52031456;52043904;52052800;52034080 | 55768800;55767104;55740192;55728288;55772160 | |
422 | resnetv24_stage3_activation91 | Activation | [256,256,14,14] | 2763.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380586.67 | 51367104.00 | 95.70 | 0.25 | 188.90 | true | 0.957002;0.957536;0.957196;0.957108;0.957549 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380576;51380576;51380608 | 51367360;51369632;51364928;51367776;51366176 | |
423 | resnetv24_stage3_conv92_fwd | Convolution | [256,256,14,14] | 3403011.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.00 | 19365101568 | 82911872.00 | 77522080.00 | 24.70 | 120.70 | 12927.30 | false | 0.247030;0.246615;0.247021;0.246989;0.246539 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 81196896;88301184;88366720;78878880;79237536 | 78136128;75424256;77477728;77350784;77737728 | |
423 | resnetv24_stage3_conv92_fwd | Convolution | [256,256,14,14] | 3403011.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51400010.67 | 150578208.00 | 47.10 | 1.25 | 912.91 | true | 0.469986;0.471317;0.470874;0.470328;0.470326 | 251658240;251658240;251658240;251658240;251658240 | 51375328;51405536;51394784;51404832;51400416 | 150581568;150573408;150579648;150562624;150664160 | |
423 | resnetv24_stage3_conv92_fwd | Convolution | [256,256,14,14] | 3403011.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.67 | 269484032 | 151340085.33 | 54222741.33 | 48.30 | 1.31 | 1022.06 | true | 0.482602;0.482745;0.482723;0.482194;0.481994 | 269484032;269484032;269484032;269484032;269484032 | 54187360;54280064;54141600;54207968;54272896 | 151309472;151334304;151335744;151350208;151355968 | |
423 | resnetv24_stage3_conv92_fwd | Convolution | [256,256,14,14] | 3403011.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9337568.00 | 30.30 | 0.45 | 318.50 | true | 0.305839;0.302288;0.304562;0.303221;0.301318 | 5308416;5308416;5308416;5308416;5308416 | 9329536;9353344;9333120;9350048;9242144 | 2359552;2359552;2359552;2360576;2359552 | |
424 | resnetv24_stage3_batchnorm92_fwd | BatchNorm | [256,256,14,14] | 6313.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51776469.33 | 55931584.00 | 88.20 | 0.75 | 568.28 | true | 0.881806;0.882024;0.881587;0.881401;0.881538 | 81264640;81264640;81264640;81264640;81264640 | 51766432;51778880;51783776;51767872;51782656 | 55944480;55921088;55951456;55929184;55889792 | |
425 | resnetv24_stage3_activation92 | Activation | [256,256,14,14] | 2806.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380586.67 | 51371904.00 | 95.70 | 0.25 | 188.44 | true | 0.957210;0.957067;0.957069;0.956824;0.957893 | 25690112;25690112;25690112;25690112;25690112 | 51372352;51373600;51370016;51361728;51373344 | 51380576;51380608;51380576;51380608;51380576 | |
426 | resnetv24_stage3_conv93_fwd | Convolution | [256,256,14,14] | 1416648.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.67 | 26409435136 | 153378581.33 | 152063829.33 | 24.90 | 86.46 | 12680.58 | false | 0.248997;0.248998;0.248932;0.248997;0.249033 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 150678592;152198560;152706240;151286688;152877088 | 152593376;152477568;155064800;156947488;151764192 | |
426 | resnetv24_stage3_conv93_fwd | Convolution | [256,256,14,14] | 1416648.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2688.00 | 5.80 | 0.00 | 0.00 | true | 0.057889;0.058090;0.057908;0.058126;0.058068 | 0;0;0;0;0 | 96;96;96;96;6752 | 2688;2560;2816;2432;16640 | |
427 | resnetv24_stage3__plus30 | elemwise_add | [256,1024,14,14] | 13333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 4178464.00 | 4763968.00 | 95.70 | 5.75 | 69.34 | true | 0.957001;0.956818;0.957112;0.956997;0.956957 | 51380224;51380224;51380224;51380224;51380224 | 3781120;3177408;5576864;6082336;2527168 | 4360512;3783648;6147744;6646592;3131648 | |
428 | resnetv24_stage3_batchnorm93_fwd | BatchNorm | [256,1024,14,14] | 25341.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 53978240.00 | 58149621.33 | 90.00 | 2.90 | 586.40 | true | 0.900380;0.900395;0.900669;0.900444;0.899964 | 325058560;325058560;325058560;325058560;325058560 | 51817120;58287776;51829824;61520832;51809376 | 55820768;62808544;55837696;62790400;55820000 | |
429 | resnetv24_stage3_activation93 | Activation | [256,1024,14,14] | 10772.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.67 | 102760448 | 66366400.00 | 66367669.33 | 98.80 | 0.77 | 189.36 | true | 0.987673;0.987581;0.986929;0.987580;0.987841 | 102760448;102760448;102760448;102760448;102760448 | 67436608;73862048;64225632;64225696;67436896 | 67446816;77066944;64220384;64221184;67435008 | |
430 | resnetv24_stage3_conv94_fwd | Convolution | [256,1024,14,14] | 1482440.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2097.00 | 26332364800 | 158012512.00 | 37756725.33 | 24.70 | 134.51 | 12557.16 | false | 0.246891;0.247802;0.247502;0.247364;0.247068 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 163631168;160535008;155333600;156296192;157206336 | 37714944;37713664;37816640;37738592;37897440 | |
430 | resnetv24_stage3_conv94_fwd | Convolution | [256,1024,14,14] | 1482440.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057955;0.058085;0.057904;0.058097;0.058310 | 0;0;0;0;0 | 3456;2432;2432;2432;2432 | 608;96;96;96;96 | |
431 | resnetv24_stage3_batchnorm94_fwd | BatchNorm | [256,256,14,14] | 6362.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52051690.67 | 55772576.00 | 88.20 | 0.75 | 566.96 | true | 0.883090;0.882293;0.882402;0.882216;0.882153 | 81264640;81264640;81264640;81264640;81264640 | 52060384;52058304;52076928;52035232;52036384 | 55757056;55756544;55710848;55835648;55804128 | |
432 | resnetv24_stage3_activation94 | Activation | [256,256,14,14] | 2747 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380608.00 | 51370709.33 | 95.70 | 0.25 | 188.90 | true | 0.957337;0.957708;0.956918;0.956131;0.956202 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380832;51380576;51380608;51380608 | 51369056;51361504;51377056;51372064;51371008 | |
433 | resnetv24_stage3_conv95_fwd | Convolution | [256,256,14,14] | 3414107 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1499.00 | 19365101568 | 85761888.00 | 77349610.67 | 24.70 | 118.72 | 12918.68 | false | 0.246844;0.246935;0.246896;0.246858;0.246918 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 81692896;75264544;88597344;89744672;86995424 | 75438976;77063168;77492512;79038528;77493152 | |
433 | resnetv24_stage3_conv95_fwd | Convolution | [256,256,14,14] | 3414107 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 277.00 | 251658240 | 51397493.33 | 150605930.67 | 47.10 | 1.25 | 908.51 | true | 0.470556;0.470964;0.471310;0.471181;0.472112 | 251658240;251658240;251658240;251658240;251658240 | 51406112;51379552;51397280;51421472;51389088 | 150586560;150625344;150586208;150605888;150665024 | |
433 | resnetv24_stage3_conv95_fwd | Convolution | [256,256,14,14] | 3414107 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.00 | 269484032 | 151356832.00 | 54236501.33 | 48.20 | 1.31 | 1032.51 | true | 0.481926;0.482156;0.482766;0.483092;0.481889 | 269484032;269484032;269484032;269484032;269484032 | 151364320;151346624;151359552;151376192;151338496 | 54263392;54239904;54206208;54300288;54158976 | |
433 | resnetv24_stage3_conv95_fwd | Convolution | [256,256,14,14] | 3414107 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9314858.67 | 30.60 | 0.45 | 331.78 | true | 0.303272;0.308990;0.305887;0.309192;0.301995 | 5308416;5308416;5308416;5308416;5308416 | 2361600;2359552;2359552;2359552;2359552 | 9379328;9290496;9352832;9301248;9237248 | |
434 | resnetv24_stage3_batchnorm95_fwd | BatchNorm | [256,256,14,14] | 6331.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 145.00 | 81264640 | 51775488.00 | 55911274.67 | 88.20 | 0.75 | 560.45 | true | 0.882122;0.881512;0.881951;0.881872;0.881893 | 81264640;81264640;81264640;81264640;81264640 | 51773408;51779200;51773248;51780224;51773856 | 55900480;55931232;55910016;55923328;55896800 | |
435 | resnetv24_stage3_activation95 | Activation | [256,256,14,14] | 2741.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51369066.67 | 95.70 | 0.25 | 188.90 | true | 0.956796;0.956779;0.957482;0.956756;0.958464 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380608 | 51372736;51370848;51370336;51362976;51366016 | |
436 | resnetv24_stage3_conv96_fwd | Convolution | [256,256,14,14] | 1423925.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2078.67 | 26409435136 | 155722848.00 | 151658517.33 | 24.90 | 85.92 | 12704.99 | false | 0.248927;0.248937;0.249000;0.248873;0.249018 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 155431328;157219840;154517376;163532512;154119648 | 151619264;153358848;151576544;151779744;150496384 | |
436 | resnetv24_stage3_conv96_fwd | Convolution | [256,256,14,14] | 1423925.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 3626.67 | 5.80 | 0.00 | 0.00 | true | 0.057935;0.058109;0.057885;0.058119;0.058065 | 0;0;0;0;0 | 2144;96;96;96;96 | 6528;2432;2432;5760;2688 | |
437 | resnetv24_stage3__plus31 | elemwise_add | [256,1024,14,14] | 13215 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 3269109.33 | 3848064.00 | 95.60 | 7.22 | 69.43 | true | 0.956718;0.956354;0.956408;0.956354;0.956503 | 51380224;51380224;51380224;51380224;51380224 | 6106912;160;4742208;2851552;2213568 | 6715808;512;5335552;3444864;2763776 | |
438 | resnetv24_stage3_batchnorm96_fwd | BatchNorm | [256,1024,14,14] | 24881.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 68005408.00 | 73273994.67 | 90.00 | 2.30 | 585.69 | true | 0.900184;0.900782;0.900339;0.900170;0.900693 | 325058560;325058560;325058560;325058560;325058560 | 58287136;77719104;68009984;207233760;58279744 | 62804832;83753152;73264000;223306912;62800128 | |
439 | resnetv24_stage3_activation96 | Activation | [256,1024,14,14] | 10791.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.00 | 102760448 | 68507360.00 | 70645856.00 | 98.80 | 0.74 | 189.59 | true | 0.987767;0.987285;0.987723;0.987421;0.987445 | 102760448;102760448;102760448;102760448;102760448 | 61014080;67436960;73859424;93134528;64225696 | 57802624;70647264;77070080;89913280;64220224 | |
440 | resnetv24_stage3_conv97_fwd | Convolution | [256,1024,14,14] | 1475572.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2099.33 | 26332364800 | 159244480.00 | 37770208.00 | 24.70 | 133.66 | 12543.21 | false | 0.247283;0.247513;0.247322;0.247992;0.247520 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 157877408;154662112;160327552;163143264;159528480 | 37782016;37813760;37744544;37670368;37784064 | |
440 | resnetv24_stage3_conv97_fwd | Convolution | [256,1024,14,14] | 1475572.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.057872;0.058094;0.057951;0.058072;0.058478 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2688;2432;2688 | |
441 | resnetv24_stage3_batchnorm97_fwd | BatchNorm | [256,256,14,14] | 6257 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52030357.33 | 55793738.67 | 88.20 | 0.75 | 568.28 | true | 0.882062;0.882443;0.881943;0.882130;0.881612 | 81264640;81264640;81264640;81264640;81264640 | 52013856;52036256;52045984;52029056;52025760 | 55845056;55729024;55778336;55824672;55778208 | |
442 | resnetv24_stage3_activation97 | Activation | [256,256,14,14] | 2732.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380586.67 | 51370688.00 | 95.80 | 0.25 | 187.98 | true | 0.956658;0.957656;0.957634;0.957784;0.958136 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380576;51380608;51380576 | 51375360;51368288;51379968;51359392;51368416 | |
443 | resnetv24_stage3_conv98_fwd | Convolution | [256,256,14,14] | 3399561.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.00 | 19365101568 | 83932874.67 | 77201706.67 | 24.70 | 120.18 | 12927.30 | false | 0.246846;0.246721;0.246684;0.247010;0.246907 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 83820928;77048064;84770112;85416160;83207584 | 76988128;77105120;77511872;77603104;76150464 | |
443 | resnetv24_stage3_conv98_fwd | Convolution | [256,256,14,14] | 3399561.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.67 | 251658240 | 51415242.67 | 150603605.33 | 47.10 | 1.25 | 909.61 | true | 0.471029;0.470564;0.470469;0.470853;0.471456 | 251658240;251658240;251658240;251658240;251658240 | 51419808;51377120;51448224;51433760;51392160 | 150596192;150615648;150598976;150630496;150529120 | |
443 | resnetv24_stage3_conv98_fwd | Convolution | [256,256,14,14] | 3399561.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 263.33 | 269484032 | 151333056.00 | 54240778.67 | 48.20 | 1.31 | 1023.36 | true | 0.482511;0.482045;0.482842;0.482250;0.482325 | 269484032;269484032;269484032;269484032;269484032 | 151336096;151339104;151351296;151310560;151323968 | 54265568;54208352;54248416;54158656;54298816 | |
443 | resnetv24_stage3_conv98_fwd | Convolution | [256,256,14,14] | 3399561.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9310549.33 | 30.90 | 0.45 | 318.50 | true | 0.304716;0.307492;0.312075;0.312443;0.307241 | 5308416;5308416;5308416;5308416;5308416 | 9323904;9303424;9298464;9304320;9398656 | 2359552;2359552;2359552;2359552;2359552 | |
444 | resnetv24_stage3_batchnorm98_fwd | BatchNorm | [256,256,14,14] | 6330.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51768522.67 | 55923306.67 | 88.20 | 0.75 | 568.28 | true | 0.882106;0.881660;0.882693;0.881896;0.882070 | 81264640;81264640;81264640;81264640;81264640 | 51787744;51772384;51754176;51750592;51779008 | 55917280;55950336;55934368;55903072;55918272 | |
445 | resnetv24_stage3_activation98 | Activation | [256,256,14,14] | 2775 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.00 | 25690112 | 51380576.00 | 51372128.00 | 95.70 | 0.25 | 187.52 | true | 0.958002;0.957009;0.957396;0.959106;0.956902 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380576;51380576;51380576 | 51371776;51374400;51385376;51370208;51364928 | |
446 | resnetv24_stage3_conv99_fwd | Convolution | [256,256,14,14] | 1415085 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2079.33 | 26409435136 | 155891445.33 | 152763242.67 | 24.90 | 85.56 | 12700.92 | false | 0.249030;0.248751;0.249051;0.248856;0.248966 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 156971360;159736608;154277440;153476224;156425536 | 152227456;154186720;150049920;155187552;151875552 | |
446 | resnetv24_stage3_conv99_fwd | Convolution | [256,256,14,14] | 1415085 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057885;0.058049;0.057930;0.058085;0.058561 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2176;2688;2432 | |
447 | resnetv24_stage3__plus32 | elemwise_add | [256,1024,14,14] | 13386 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 3982410.67 | 4576896.00 | 95.70 | 6.00 | 69.34 | true | 0.956974;0.957244;0.956979;0.957166;0.956942 | 51380224;51380224;51380224;51380224;51380224 | 344384;2726976;5699264;3560320;5659936 | 699232;3331328;6313440;4139296;6260064 | |
448 | resnetv24_stage3_batchnorm99_fwd | BatchNorm | [256,1024,14,14] | 25192.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 59333856.00 | 62787669.33 | 90.00 | 2.66 | 585.69 | true | 0.899970;0.900329;0.900157;0.900516;0.900220 | 325058560;325058560;325058560;325058560;325058560 | 62792544;69762208;55819424;62782080;62788384 | 58259584;64730112;48541184;61486272;58255712 | |
449 | resnetv24_stage3_activation99 | Activation | [256,1024,14,14] | 10790.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.67 | 102760448 | 67440234.67 | 68504554.67 | 98.70 | 0.76 | 189.71 | true | 0.987043;0.987861;0.987176;0.987441;0.987140 | 102760448;102760448;102760448;102760448;102760448 | 70647872;67447200;64225632;61014432;93127008 | 70646656;70645504;64221504;57803328;89911232 | |
450 | resnetv24_stage3_conv100_fwd | Convolution | [256,1024,14,14] | 1481520.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2092.67 | 26332364800 | 156251210.67 | 37802784.00 | 24.70 | 135.70 | 12583.16 | false | 0.247371;0.247905;0.247426;0.247126;0.247008 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37720576;37828832;37812064;37804288;37792000 | 168157600;156609120;156149248;155015456;155995264 | |
450 | resnetv24_stage3_conv100_fwd | Convolution | [256,1024,14,14] | 1481520.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058514;0.058114;0.057938;0.058056;0.058110 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2432 | |
451 | resnetv24_stage3_batchnorm100_fwd | BatchNorm | [256,256,14,14] | 6262 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52044170.67 | 55753397.33 | 88.20 | 0.75 | 568.28 | true | 0.882760;0.881876;0.882169;0.881681;0.881943 | 81264640;81264640;81264640;81264640;81264640 | 52022464;52044896;52046784;52045696;52041920 | 55815872;55734784;55763072;55762336;55734144 | |
452 | resnetv24_stage3_activation100 | Activation | [256,256,14,14] | 2734 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51371370.67 | 95.80 | 0.25 | 188.44 | true | 0.957531;0.957909;0.957192;0.958131;0.956280 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380608 | 51357568;51370368;51381024;51374560;51369184 | |
453 | resnetv24_stage3_conv101_fwd | Convolution | [256,256,14,14] | 3413073.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.33 | 19365101568 | 87148554.67 | 77954197.33 | 24.70 | 117.29 | 12924.43 | false | 0.246969;0.246611;0.246589;0.246882;0.246767 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 77559232;77996640;78123680;78191456;77742272 | 87498752;87931776;90134720;86015136;85893856 | |
453 | resnetv24_stage3_conv101_fwd | Convolution | [256,256,14,14] | 3413073.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51396682.67 | 150607776.00 | 47.10 | 1.25 | 912.91 | true | 0.471645;0.471165;0.471378;0.470548;0.471304 | 251658240;251658240;251658240;251658240;251658240 | 51379232;51406624;51405408;51394400;51390240 | 150581568;150624224;150617536;150651456;150574656 | |
453 | resnetv24_stage3_conv101_fwd | Convolution | [256,256,14,14] | 3413073.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.67 | 269484032 | 151350069.33 | 54297376.00 | 48.20 | 1.31 | 1033.82 | true | 0.482420;0.481998;0.482263;0.482038;0.482204 | 269484032;269484032;269484032;269484032;269484032 | 151370304;151332896;151347232;151343136;151359840 | 54330176;54252960;54308992;54219616;54362784 | |
453 | resnetv24_stage3_conv101_fwd | Convolution | [256,256,14,14] | 3413073.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9309952.00 | 30.90 | 0.45 | 331.78 | true | 0.311815;0.318162;0.313285;0.302352;0.300130 | 5308416;5308416;5308416;5308416;5308416 | 9348864;9285504;9295488;9278976;9364736 | 2359552;2359552;2359552;2359552;2359552 | |
454 | resnetv24_stage3_batchnorm101_fwd | BatchNorm | [256,256,14,14] | 6292.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.33 | 81264640 | 51776480.00 | 55905781.33 | 88.20 | 0.75 | 563.04 | true | 0.881734;0.882045;0.882042;0.881842;0.881263 | 81264640;81264640;81264640;81264640;81264640 | 51777600;51774752;51784288;51767616;51777088 | 55874688;55924320;55879680;55942624;55913344 | |
455 | resnetv24_stage3_activation101 | Activation | [256,256,14,14] | 2765.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51369888.00 | 95.70 | 0.25 | 188.44 | true | 0.957204;0.956692;0.955413;0.959073;0.956738 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380576;51380576;51380576 | 51359328;51375680;51371264;51364512;51373888 | |
456 | resnetv24_stage3_conv102_fwd | Convolution | [256,256,14,14] | 1426076 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2073.67 | 26409435136 | 156866240.00 | 152611946.67 | 24.90 | 85.34 | 12735.62 | false | 0.249009;0.248886;0.248999;0.248875;0.248973 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153088224;160510208;156067264;156968480;157562976 | 150283072;154411744;154071008;151621568;152143264 | |
456 | resnetv24_stage3_conv102_fwd | Convolution | [256,256,14,14] | 1426076 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.057942;0.058102;0.057920;0.058084;0.058083 | 0;0;0;0;0 | 96;96;96;6752;96 | 2432;2432;2560;16640;2432 | |
457 | resnetv24_stage3__plus33 | elemwise_add | [256,1024,14,14] | 13191.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 1361034.67 | 1560885.33 | 95.70 | 17.58 | 69.34 | false | 0.956743;0.956543;0.956354;0.956593;0.956758 | 51380224;51380224;51380224;51380224;51380224 | 4082336;640;6079904;128;128 | 4679424;672;6724224;2336;896 | |
458 | resnetv24_stage3_batchnorm102_fwd | BatchNorm | [256,1024,14,14] | 24852.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 52893386.67 | 58156320.00 | 90.20 | 2.93 | 586.40 | true | 0.902161;0.902036;0.902127;0.902184;0.902037 | 325058560;325058560;325058560;325058560;325058560 | 48569088;51810464;55048096;55051136;51821600 | 55812608;55828064;62811104;62807360;55833536 | |
459 | resnetv24_stage3_activation102 | Activation | [256,1024,14,14] | 10761 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.00 | 102760448 | 68507360.00 | 69573472.00 | 98.70 | 0.74 | 189.59 | true | 0.987078;0.987191;0.986950;0.987547;0.987242 | 102760448;102760448;102760448;102760448;102760448 | 67436960;77070752;61014144;61014432;77070688 | 67435456;77061376;61011616;64223584;77063552 | |
460 | resnetv24_stage3_conv103_fwd | Convolution | [256,1024,14,14] | 1479606.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 166950261.33 | 37617728.00 | 24.80 | 128.72 | 12565.15 | false | 0.247963;0.247512;0.246524;0.247761;0.247495 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 173184672;169098144;165717632;164544384;166035008 | 37467296;37305056;37704672;37681216;37756992 | |
460 | resnetv24_stage3_conv103_fwd | Convolution | [256,1024,14,14] | 1479606.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057914;0.058717;0.057918;0.058102;0.058037 | 0;0;0;0;0 | 2432;2688;2432;2688;2432 | 96;96;96;96;96 | |
461 | resnetv24_stage3_batchnorm103_fwd | BatchNorm | [256,256,14,14] | 6286.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 52054986.67 | 55711744.00 | 88.30 | 0.75 | 564.34 | true | 0.882702;0.882365;0.883009;0.882453;0.882465 | 81264640;81264640;81264640;81264640;81264640 | 52056960;52046368;52061728;52053728;52054272 | 55734368;55773280;55675872;55629504;55724992 | |
462 | resnetv24_stage3_activation103 | Activation | [256,256,14,14] | 2743.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380586.67 | 51367146.67 | 95.70 | 0.25 | 188.90 | true | 0.958703;0.957571;0.957498;0.957122;0.955439 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380608;51380576;51380608 | 51356064;51364832;51364096;51372512;51373600 | |
463 | resnetv24_stage3_conv104_fwd | Convolution | [256,256,14,14] | 3404091.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1512.67 | 19365101568 | 82508533.33 | 77351882.67 | 24.70 | 121.14 | 12801.96 | false | 0.246586;0.246799;0.247275;0.246768;0.246781 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 81358752;86415776;80983072;85183776;80353600 | 76498624;77469376;77061312;77524960;77853472 | |
463 | resnetv24_stage3_conv104_fwd | Convolution | [256,256,14,14] | 3404091.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51378293.33 | 150587264.00 | 47.10 | 1.25 | 911.81 | true | 0.471182;0.471077;0.470821;0.470730;0.471779 | 251658240;251658240;251658240;251658240;251658240 | 51378400;51388576;51352672;51420320;51367904 | 150567744;150595680;150598368;150612448;150566624 | |
463 | resnetv24_stage3_conv104_fwd | Convolution | [256,256,14,14] | 3404091.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 262.33 | 269484032 | 151323957.33 | 54218592.00 | 48.20 | 1.31 | 1027.26 | true | 0.482236;0.482337;0.483585;0.482635;0.481694 | 269484032;269484032;269484032;269484032;269484032 | 54300864;54170528;54185248;54208192;54262336 | 151365312;151335872;151313280;151322720;151304480 | |
463 | resnetv24_stage3_conv104_fwd | Convolution | [256,256,14,14] | 3404091.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359978.67 | 9339146.67 | 30.70 | 0.45 | 325.01 | true | 0.309492;0.303121;0.318976;0.306820;0.304271 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2361600;2359552;2360832 | 9339040;9345280;9333120;9314976;9375872 | |
464 | resnetv24_stage3_batchnorm104_fwd | BatchNorm | [256,256,14,14] | 6344.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 51773002.67 | 55914218.67 | 88.20 | 0.75 | 566.96 | true | 0.881584;0.881670;0.881375;0.882051;0.881270 | 81264640;81264640;81264640;81264640;81264640 | 55915264;55922304;55882176;55928800;55905088 | 51763520;51771712;51778240;51778240;51769056 | |
465 | resnetv24_stage3_activation104 | Activation | [256,256,14,14] | 2756 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380586.67 | 51369770.67 | 95.70 | 0.25 | 188.90 | true | 0.956508;0.957740;0.957087;0.957200;0.957617 | 25690112;25690112;25690112;25690112;25690112 | 51366400;51379456;51371552;51371360;51363072 | 51380576;51389792;51380608;51380576;51380576 | |
466 | resnetv24_stage3_conv105_fwd | Convolution | [256,256,14,14] | 1416289 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2076.00 | 26409435136 | 158715466.67 | 152743680.00 | 24.90 | 84.79 | 12721.31 | false | 0.248944;0.249033;0.249044;0.249075;0.248983 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 158026080;157708896;168358080;160411424;155041824 | 152093056;152372736;156096288;153765248;151767040 | |
466 | resnetv24_stage3_conv105_fwd | Convolution | [256,256,14,14] | 1416289 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2773.33 | 5.80 | 0.00 | 0.00 | true | 0.057881;0.058060;0.058369;0.058092;0.058077 | 0;0;0;0;0 | 96;608;96;96;96 | 2688;3200;2944;2432;2688 | |
467 | resnetv24_stage3__plus34 | elemwise_add | [256,1024,14,14] | 13260.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 2710592.00 | 3120885.33 | 95.70 | 8.81 | 69.34 | true | 0.956903;0.957165;0.957274;0.956977;0.956541 | 51380224;51380224;51380224;51380224;51380224 | 6275360;4816096;128;128;3315552 | 6826656;5425056;352;288;3937248 | |
468 | resnetv24_stage3_batchnorm105_fwd | BatchNorm | [256,1024,14,14] | 25164.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 58242677.33 | 65105312.00 | 90.10 | 2.64 | 585.69 | true | 0.900629;0.900883;0.900335;0.900637;0.900360 | 325058560;325058560;325058560;325058560;325058560 | 58249984;61482368;84120960;48528800;54995680 | 62785376;69755520;90686016;55809664;62775040 | |
469 | resnetv24_stage3_activation105 | Activation | [256,1024,14,14] | 10977.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.33 | 102760448 | 64225696.00 | 65292416.00 | 98.80 | 0.79 | 189.48 | true | 0.987355;0.987573;0.987276;0.987995;0.987777 | 102760448;102760448;102760448;102760448;102760448 | 57802880;70648224;64225696;67436960;61014432 | 57797536;70643328;64222752;70646240;61011168 | |
470 | resnetv24_stage3_conv106_fwd | Convolution | [256,1024,14,14] | 1482411.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 157839904.00 | 37792928.00 | 24.80 | 134.60 | 12565.15 | false | 0.247730;0.247921;0.247680;0.247059;0.247625 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 154826240;162846592;154704256;160919328;157774144 | 37849568;37794720;37532096;37849984;37734496 | |
470 | resnetv24_stage3_conv106_fwd | Convolution | [256,1024,14,14] | 1482411.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 3797.33 | 5.80 | 0.00 | 0.00 | true | 0.057933;0.058116;0.057912;0.058068;0.058109 | 0;0;0;0;0 | 96;2144;96;96;96 | 2432;6528;2432;7552;2432 | |
471 | resnetv24_stage3_batchnorm106_fwd | BatchNorm | [256,256,14,14] | 6304.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 52050634.67 | 55757898.67 | 88.20 | 0.75 | 564.34 | true | 0.882451;0.882832;0.882115;0.883240;0.881823 | 81264640;81264640;81264640;81264640;81264640 | 52039360;52042400;52058432;52058304;52051200 | 55810016;55764416;55725344;55692544;55783936 | |
472 | resnetv24_stage3_activation106 | Activation | [256,256,14,14] | 2830 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380661.33 | 51370880.00 | 95.70 | 0.25 | 188.90 | true | 0.956915;0.956937;0.957399;0.957526;0.958624 | 25690112;25690112;25690112;25690112;25690112 | 51369056;51376224;51367360;51380448;51358336 | 51380832;51380576;51380576;51383392;51380576 | |
473 | resnetv24_stage3_conv107_fwd | Convolution | [256,256,14,14] | 3415272 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.33 | 19365101568 | 83944768.00 | 77820384.00 | 24.70 | 119.71 | 12924.43 | false | 0.246694;0.246795;0.247024;0.246940;0.246552 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 81683648;86296064;80292384;86738208;83854592 | 77875200;77708320;78161312;77877632;76002944 | |
473 | resnetv24_stage3_conv107_fwd | Convolution | [256,256,14,14] | 3415272 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51401056.00 | 150576970.67 | 47.10 | 1.25 | 912.91 | true | 0.471275;0.471469;0.470675;0.470510;0.470514 | 251658240;251658240;251658240;251658240;251658240 | 150568384;150551136;150570432;150594400;150592096 | 51402784;51391776;51411744;51381216;51408608 | |
473 | resnetv24_stage3_conv107_fwd | Convolution | [256,256,14,14] | 3415272 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.00 | 269484032 | 151345696.00 | 54225802.67 | 48.20 | 1.31 | 1032.51 | true | 0.482299;0.482620;0.482208;0.482266;0.482173 | 269484032;269484032;269484032;269484032;269484032 | 151357856;151322752;151337152;151353152;151346784 | 54283840;54374688;54089376;54304192;54082592 | |
473 | resnetv24_stage3_conv107_fwd | Convolution | [256,256,14,14] | 3415272 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9354261.33 | 30.50 | 0.45 | 318.50 | true | 0.305787;0.306211;0.302078;0.306471;0.302601 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2361344 | 9320608;9383424;9381120;9315840;9361056 | |
474 | resnetv24_stage3_batchnorm107_fwd | BatchNorm | [256,256,14,14] | 6335.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 51782048.00 | 55903402.67 | 88.20 | 0.75 | 564.34 | true | 0.882247;0.881878;0.881585;0.882231;0.881535 | 81264640;81264640;81264640;81264640;81264640 | 51767584;51783456;51775904;51786784;51788672 | 55893312;55914048;55916064;55879520;55902848 | |
475 | resnetv24_stage3_activation107 | Activation | [256,256,14,14] | 2753.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51374122.67 | 95.70 | 0.25 | 188.44 | true | 0.955501;0.956464;0.957629;0.958379;0.957540 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51374112;51376768;51371712;51373376;51374880 | |
476 | resnetv24_stage3_conv108_fwd | Convolution | [256,256,14,14] | 1424261.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.67 | 26409435136 | 157999808.00 | 152002677.33 | 24.90 | 85.19 | 12680.58 | false | 0.248987;0.248975;0.249013;0.249063;0.249042 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153512128;161838368;156003936;156157120;164853536 | 152372960;151811200;151744096;151823872;155486496 | |
476 | resnetv24_stage3_conv108_fwd | Convolution | [256,256,14,14] | 1424261.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057914;0.058098;0.057910;0.058085;0.058053 | 0;0;0;0;0 | 2432;2432;2432;2432;2432 | 96;96;96;96;96 | |
477 | resnetv24_stage3__plus35 | elemwise_add | [256,1024,14,14] | 13191 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 4028138.67 | 4640682.67 | 95.70 | 5.93 | 69.43 | true | 0.956565;0.956571;0.956502;0.956780;0.956338 | 51380224;51380224;51380224;51380224;51380224 | 3584096;736;5120768;5217184;6412640 | 2946816;128;4576384;4561216;5799328 | |
478 | resnetv24_stage4_batchnorm0_fwd | BatchNorm | [256,1024,14,14] | 25097 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 553.67 | 325058560 | 58288032.00 | 65138314.67 | 90.20 | 2.63 | 587.10 | true | 0.901989;0.901929;0.902045;0.901892;0.902281 | 325058560;325058560;325058560;325058560;325058560 | 48569344;74482880;61526656;55045088;58292352 | 55825632;83763104;69805472;62804736;62804736 | |
479 | resnetv24_stage4_activation0 | Activation | [256,1024,14,14] | 10847 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.00 | 102760448 | 63155178.67 | 64225632.00 | 98.70 | 0.81 | 189.59 | true | 0.986972;0.987013;0.987430;0.987517;0.986904 | 102760448;102760448;102760448;102760448;102760448 | 61014144;70648224;67436672;61014432;61014432 | 64231968;70643456;64223904;64219744;64221024 | |
480 | resnetv24_stage4_conv0_fwd | Convolution | [256,1024,14,14] | 2895909.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 4131.67 | 52664729600 | 157180949.33 | 49061386.67 | 17.80 | 255.35 | 12746.61 | false | 0.177443;0.183702;0.177981;0.178379;0.178178 | 52664729600;52664729600;52664729600;52664729600;52664729600 | 156574880;159288288;156293024;158674944;153476160 | 49005600;49054368;49095968;49100768;49033824 | |
480 | resnetv24_stage4_conv0_fwd | Convolution | [256,1024,14,14] | 2895909.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.057924;0.058048;0.058328;0.058085;0.058036 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2688;2432;2688 | |
481 | resnetv24_stage4_batchnorm1_fwd | BatchNorm | [256,512,14,14] | 12372 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 280.67 | 162529280 | 103876362.67 | 111727328.00 | 89.30 | 0.75 | 579.08 | true | 0.892896;0.893296;0.892524;0.892202;0.891947 | 162529280;162529280;162529280;162529280;162529280 | 103867136;103878496;103878944;103871648;103890752 | 111733152;111729376;111712096;111719456;111742528 | |
482 | resnetv24_stage4_activation1 | Activation | [256,512,14,14] | 5450.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.67 | 51380224 | 102760800.00 | 102752650.67 | 97.50 | 0.25 | 190.53 | true | 0.975264;0.975758;0.975022;0.974834;0.974462 | 51380224;51380224;51380224;51380224;51380224 | 102761824;102760800;102760800;102760800;102760800 | 102780512;102748928;102753216;102754560;102750176 | |
483 | resnetv24_stage4_conv1_fwd | Convolution | [256,512,14,14] | 4150465.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_small_nn_v1 | 5497.67 | 59202863104 | 21916714.67 | 9632480.00 | 15.50 | 1876.53 | 10768.72 | false | 0.154961;0.153315;0.153815;0.158997;0.156182 | 59202863104;59202863104;59202863104;59202863104;59202863104 | 22089632;23232992;21584416;21522976;22076096 | 9598816;9621888;9649376;9646208;9629344 | |
483 | resnetv24_stage4_conv1_fwd | Convolution | [256,512,14,14] | 4150465.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 1450.67 | 4.90 | 0.00 | 0.00 | true | 0.048592;0.049084;0.048909;0.049143;0.049061 | 0;0;0;0;0 | 1536;1280;1536;1280;1536 | 96;96;96;96;96 | |
484 | resnetv24_stage4_batchnorm2_fwd | BatchNorm | [256,512,7,7] | 6434 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 219.00 | 44957696 | 25819626.67 | 26769802.67 | 18.40 | 0.85 | 205.29 | true | 0.185900;0.184058;0.200717;0.183375;0.182748 | 44957696;44957696;44957696;44957696;44957696 | 25788032;25802432;25842464;25813984;25850944 | 27358400;26629440;27127808;26546816;26552160 | |
485 | resnetv24_stage4_activation2 | Activation | [256,512,7,7] | 1386 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.67 | 12845056 | 25690464.00 | 25722581.33 | 94.40 | 0.25 | 184.38 | true | 0.945120;0.943681;0.944582;0.944055;0.941747 | 12845056;12845056;12845056;12845056;12845056 | 25708992;25723072;25733440;25718560;25726112 | 25690464;25690464;25690464;25690464;25690464 | |
486 | resnetv24_stage4_conv2_fwd | Convolution | [256,512,7,7] | 1708421 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2180.33 | 26358054912 | 102760565.33 | 79080704.00 | 24.90 | 144.95 | 12089.00 | false | 0.249019;0.249165;0.249337;0.249343;0.249259 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 81131456;79707104;79320896;78125792;78214112 | 102325792;102897952;102223040;103057952;103154688 | |
486 | resnetv24_stage4_conv2_fwd | Convolution | [256,512,7,7] | 1708421 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 1450.67 | 4.90 | 0.00 | 0.00 | true | 0.048624;0.049690;0.048675;0.049366;0.049133 | 0;0;0;0;0 | 96;96;96;96;96 | 1536;1280;1536;1280;1536 | |
487 | resnetv24_stage4_conv3_fwd | Convolution | [256,1024,14,14] | 3516200 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_interior_nn_v1 | 4382.67 | 52664729600 | 280478016.00 | 47013290.67 | 16.20 | 160.81 | 12016.59 | false | 0.159744;0.162305;0.162134;0.161647;0.161226 | 52664729600;52664729600;52664729600;52664729600;52664729600 | 283014400;282504160;283210176;267093024;275915488 | 49176192;47376512;45459456;46801856;46861504 | |
487 | resnetv24_stage4_conv3_fwd | Convolution | [256,1024,14,14] | 3516200 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 160.00 | 789.33 | 4.90 | 0.00 | 0.00 | true | 0.048656;0.049130;0.048654;0.049763;0.049051 | 0;0;0;0;0 | 160;2336;96;96;224 | 800;2656;416;960;608 | |
488 | resnetv24_stage4__plus0 | elemwise_add | [256,2048,7,7] | 6606 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 373.00 | 25690112 | 205536042.67 | 102489642.67 | 93.70 | 0.08 | 68.87 | true | 0.937404;0.937055;0.937431;0.937277;0.937132 | 25690112;25690112;25690112;25690112;25690112 | 205517888;205561696;205470496;205528544;205577056 | 102345056;102250496;102523648;102636000;102600224 | |
489 | resnetv24_stage4_batchnorm3_fwd | BatchNorm | [256,2048,7,7] | 25319 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 862.33 | 179830784 | 8109738.67 | 17688736.00 | 20.40 | 6.97 | 208.54 | true | 0.199250;0.207200;0.204333;0.211789;0.200387 | 179830784;179830784;179830784;179830784;179830784 | 8285856;7984640;8058720;5811712;8359232 | 18088800;17390400;17587008;12684928;18250432 | |
490 | resnetv24_stage4_activation3 | Activation | [256,2048,7,7] | 5439.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 271.00 | 51380224 | 102760800.00 | 102717909.33 | 97.50 | 0.25 | 189.59 | true | 0.975648;0.974999;0.975751;0.975160;0.974773 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102715488;102726048;102710080;102715360;102722880 | |
491 | resnetv24_stage4_conv4_fwd | Convolution | [256,2048,7,7] | 1773306 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2481.67 | 26319519744 | 126513589.33 | 16486112.00 | 24.10 | 184.05 | 10605.58 | false | 0.241792;0.238972;0.240142;0.242659;0.240462 | 26319519744;26319519744;26319519744;26319519744;26319519744 | 132704704;123279296;130709856;125551616;120776000 | 16386304;16553856;16808640;16518176;16338400 | |
491 | resnetv24_stage4_conv4_fwd | Convolution | [256,2048,7,7] | 1773306 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 1280.00 | 4.90 | 0.00 | 0.00 | true | 0.048651;0.049137;0.048706;0.049107;0.049120 | 0;0;0;0;0 | 96;96;96;96;96 | 1280;1280;1280;1280;1280 | |
492 | resnetv24_stage4_batchnorm4_fwd | BatchNorm | [256,512,7,7] | 6481 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 218.67 | 44957696 | 25745856.00 | 25963989.33 | 19.00 | 0.87 | 205.60 | true | 0.185568;0.201774;0.181708;0.185276;0.198233 | 44957696;44957696;44957696;44957696;44957696 | 25692000;26026720;25863648;26001600;26756320 | 25754272;25753152;25730144;25627648;25765632 | |
493 | resnetv24_stage4_activation4 | Activation | [256,512,7,7] | 1434.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.67 | 12845056 | 25690464.00 | 25724938.67 | 94.40 | 0.25 | 184.38 | true | 0.941750;0.945264;0.942909;0.946012;0.944925 | 12845056;12845056;12845056;12845056;12845056 | 25690464;25690464;25690464;25690464;25690464 | 25727648;25723840;25723328;25712000;25731424 | |
494 | resnetv24_stage4_conv5_fwd | Convolution | [256,512,7,7] | 4163114.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1640.67 | 19346227200 | 53263786.67 | 39500416.00 | 24.70 | 208.55 | 11791.68 | false | 0.247335;0.247209;0.247531;0.247203;0.246733 | 19346227200;19346227200;19346227200;19346227200;19346227200 | 55760800;63023104;53563072;49959776;50467488 | 37309824;36881408;44192512;44340384;36998912 | |
494 | resnetv24_stage4_conv5_fwd | Convolution | [256,512,7,7] | 4163114.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 138.67 | 125829120 | 25388938.67 | 75266528.00 | 42.20 | 1.25 | 907.42 | true | 0.419703;0.422322;0.422423;0.421093;0.422840 | 125829120;125829120;125829120;125829120;125829120 | 25388768;25384608;25378912;25397792;25393440 | 75291872;75237856;75292992;75269856;75233376 | |
494 | resnetv24_stage4_conv5_fwd | Convolution | [256,512,7,7] | 4163114.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 136.00 | 433586176 | 75627669.33 | 28524416.00 | 47.60 | 4.16 | 3188.13 | true | 0.475227;0.475697;0.476129;0.475767;0.476022 | 433586176;433586176;433586176;433586176;433586176 | 75617024;75627872;75618400;75636736;75681408 | 28419360;28463456;28565472;28544320;28741472 | |
494 | resnetv24_stage4_conv5_fwd | Convolution | [256,512,7,7] | 4163114.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 66.00 | 21233664 | 9440256.00 | 37413600.00 | 76.40 | 0.45 | 321.72 | true | 0.769627;0.763471;0.758620;0.770794;0.759237 | 21233664;21233664;21233664;21233664;21233664 | 9440256;9440256;9440256;9440256;9440256 | 37421056;37442432;37386496;37418656;37401088 | |
495 | resnetv24_stage4_batchnorm5_fwd | BatchNorm | [256,512,7,7] | 6358 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 219.00 | 44957696 | 25412384.00 | 27654250.67 | 19.90 | 0.85 | 205.29 | true | 0.199934;0.200109;0.197722;0.196007;0.197973 | 44957696;44957696;44957696;44957696;44957696 | 25413248;25407840;25410720;25417376;25413184 | 27587616;27738016;27631744;27659360;27671648 | |
496 | resnetv24_stage4_activation5 | Activation | [256,512,7,7] | 1382.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.33 | 12845056 | 25690464.00 | 25703946.67 | 94.50 | 0.25 | 185.27 | true | 0.945113;0.945485;0.945711;0.943222;0.941197 | 12845056;12845056;12845056;12845056;12845056 | 25690464;25690464;25695584;25690464;25690464 | 25710784;25682528;25726944;25705440;25695616 | |
497 | resnetv24_stage4_conv6_fwd | Convolution | [256,512,7,7] | 1733861.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2196.67 | 26358054912 | 103148853.33 | 79280288.00 | 24.90 | 144.48 | 11999.11 | false | 0.249312;0.249179;0.249317;0.249211;0.249337 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 102743520;103113856;102802656;106691584;103530048 | 77705792;80607456;80694432;78753504;78479904 | |
497 | resnetv24_stage4_conv6_fwd | Convolution | [256,512,7,7] | 1733861.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 1322.67 | 4.90 | 0.00 | 0.00 | true | 0.048769;0.049153;0.049682;0.049087;0.050000 | 0;0;0;0;0 | 96;96;96;96;96 | 1408;14848;1024;1280;1280 | |
498 | resnetv24_stage4__plus1 | elemwise_add | [256,2048,7,7] | 6747.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 373.67 | 25690112 | 205522496.00 | 102480960.00 | 93.70 | 0.08 | 68.75 | true | 0.937069;0.936612;0.937035;0.937414;0.937390 | 25690112;25690112;25690112;25690112;25690112 | 205474080;205500992;205556192;205510304;205583456 | 102062304;102439552;102577504;102564672;102438656 | |
499 | resnetv24_stage4_batchnorm6_fwd | BatchNorm | [256,2048,7,7] | 25208 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 862.33 | 179830784 | 7351072.00 | 16058005.33 | 20.70 | 7.68 | 208.54 | true | 0.213900;0.211827;0.208531;0.197671;0.201894 | 179830784;179830784;179830784;179830784;179830784 | 8425120;5662208;6208096;7745152;8099968 | 18410848;12395776;13536928;16986752;17650336 | |
500 | resnetv24_stage4_activation6 | Activation | [256,2048,7,7] | 5399.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 271.00 | 51380224 | 102760800.00 | 102743285.33 | 97.50 | 0.25 | 189.59 | true | 0.976452;0.975122;0.975361;0.975254;0.974828 | 51380224;51380224;51380224;51380224;51380224 | 102769248;102760800;102760800;102760800;102760800 | 102742336;102739648;102747872;102733728;102763904 | |
501 | resnetv24_stage4_conv7_fwd | Convolution | [256,2048,7,7] | 1762801.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2482.33 | 26319519744 | 129897738.67 | 16661834.67 | 24.20 | 179.58 | 10602.74 | false | 0.241034;0.241527;0.241983;0.241944;0.241031 | 26319519744;26319519744;26319519744;26319519744;26319519744 | 120794784;133280064;131309216;126468704;131915296 | 16868000;16724320;16453504;16681696;16579488 | |
501 | resnetv24_stage4_conv7_fwd | Convolution | [256,2048,7,7] | 1762801.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 1365.33 | 4.90 | 0.00 | 0.00 | true | 0.048579;0.049110;0.048675;0.049176;0.049077 | 0;0;0;0;0 | 96;96;96;96;96 | 1280;28416;1280;1536;1280 | |
502 | resnetv24_stage4_batchnorm7_fwd | BatchNorm | [256,512,7,7] | 6432 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 219.00 | 44957696 | 25688586.67 | 25883808.00 | 18.50 | 0.87 | 205.29 | true | 0.181635;0.190810;0.183167;0.184720;0.187030 | 44957696;44957696;44957696;44957696;44957696 | 25673984;25680896;25673088;25728992;25710880 | 25749600;25989824;25869600;25892128;25889696 | |
503 | resnetv24_stage4_activation7 | Activation | [256,512,7,7] | 1419 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.33 | 12845056 | 25690464.00 | 25726922.67 | 94.40 | 0.25 | 185.27 | true | 0.948215;0.944045;0.945037;0.941717;0.944267 | 12845056;12845056;12845056;12845056;12845056 | 25690464;25690464;25690464;25690464;25690464 | 25722720;25705408;25728512;25729536;25731296 | |
504 | resnetv24_stage4_conv8_fwd | Convolution | [256,512,7,7] | 4140253.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1643.33 | 19346227200 | 55607328.00 | 44055786.67 | 24.70 | 194.12 | 11772.55 | false | 0.247259;0.247592;0.246983;0.247298;0.247148 | 19346227200;19346227200;19346227200;19346227200;19346227200 | 43903296;37193856;44413472;43984128;44279936 | 49859520;56598080;56268352;55933792;54619840 | |
504 | resnetv24_stage4_conv8_fwd | Convolution | [256,512,7,7] | 4140253.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 136.00 | 125829120 | 25366730.67 | 75262069.33 | 42.10 | 1.25 | 925.21 | true | 0.419326;0.420720;0.420497;0.421298;0.421812 | 125829120;125829120;125829120;125829120;125829120 | 25364064;25370720;25365408;25361312;25371488 | 75239488;75287872;75342176;75258848;75159872 | |
504 | resnetv24_stage4_conv8_fwd | Convolution | [256,512,7,7] | 4140253.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 135.33 | 433586176 | 75625664.00 | 28539125.33 | 47.60 | 4.16 | 3203.85 | true | 0.475270;0.475391;0.476237;0.475482;0.475906 | 433586176;433586176;433586176;433586176;433586176 | 75643072;75611008;75639264;75575456;75626720 | 28739072;28674336;28394432;28430176;28512864 | |
504 | resnetv24_stage4_conv8_fwd | Convolution | [256,512,7,7] | 4140253.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 67.33 | 21233664 | 9437440.00 | 37450816.00 | 77.50 | 0.45 | 315.35 | true | 0.774525;0.776622;0.773145;0.779588;0.772148 | 21233664;21233664;21233664;21233664;21233664 | 37481472;37423008;37376000;37447968;37542944 | 9437440;9437440;9437440;9437440;9437440 | |
505 | resnetv24_stage4_batchnorm8_fwd | BatchNorm | [256,512,7,7] | 6384.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 219.00 | 44957696 | 25418570.67 | 27572405.33 | 19.70 | 0.85 | 205.29 | true | 0.200539;0.186597;0.194847;0.195279;0.202071 | 44957696;44957696;44957696;44957696;44957696 | 25418752;25428352;25420000;25416960;25411072 | 27626496;27423424;27550304;27612096;27554816 | |
506 | resnetv24_stage4_activation8 | Activation | [256,512,7,7] | 1344 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.67 | 12845056 | 25690464.00 | 25712768.00 | 94.50 | 0.25 | 184.38 | true | 0.945139;0.945034;0.944583;0.944380;0.946436 | 12845056;12845056;12845056;12845056;12845056 | 25693792;25690464;25690464;25690464;25690464 | 25707168;25706368;25714464;25721984;25716672 | |
507 | resnetv24_stage4_conv9_fwd | Convolution | [256,512,7,7] | 1709176.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2165.67 | 26358054912 | 102391989.33 | 80068085.33 | 24.90 | 144.46 | 12170.87 | false | 0.249287;0.249295;0.248876;0.249205;0.249230 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 99392736;102126560;103674016;101420192;103629216 | 80631424;79331424;80011520;79704640;80488096 | |
507 | resnetv24_stage4_conv9_fwd | Convolution | [256,512,7,7] | 1709176.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 1493.33 | 4.90 | 0.00 | 0.00 | true | 0.048611;0.049064;0.048675;0.049432;0.049087 | 0;0;0;0;0 | 96;96;96;96;96 | 1536;1408;1536;1408;1536 | |
508 | resnetv24_stage4__plus2 | elemwise_add | [256,2048,7,7] | 6647.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 372.67 | 25690112 | 205531744.00 | 102503989.33 | 93.70 | 0.08 | 68.94 | true | 0.937309;0.936991;0.937253;0.936862;0.936678 | 25690112;25690112;25690112;25690112;25690112 | 205552896;205576096;205516928;205525408;205498880 | 102633856;102358272;102620064;102463072;102428832 | |
509 | resnetv24_batchnorm2_fwd | BatchNorm | [256,2048,7,7] | 25190.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 863.00 | 179830784 | 8439285.33 | 18423776.00 | 20.20 | 6.69 | 208.38 | true | 0.215910;0.197227;0.199776;0.193907;0.207567 | 179830784;179830784;179830784;179830784;179830784 | 19239168;17395808;18237696;18781248;18252384 | 8816384;7963872;8351360;8602496;8364000 | |
510 | resnetv24_relu1_fwd | Activation | [256,2048,7,7] | 5439.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.00 | 51380224 | 102760800.00 | 102720896.00 | 97.50 | 0.25 | 190.30 | true | 0.975394;0.975320;0.975779;0.975350;0.974833 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102718272;102714400;102720192;102729376;102724224 | |
511 | resnetv24_pool1_fwd | Pooling | [256,2048,7,7] | 33892 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 238.67 | 37040128 | 177505333.33 | 5618496.00 | 60.60 | 0.20 | 155.20 | true | 0.606741;0.606560;0.606223;0.606309;0.606403 | 37040128;37040128;37040128;37040128;37040128 | 177185440;177024864;176573632;178305696;178576928 | 5640384;5611808;5638752;5604928;5592960 | |
513 | resnetv24_dense0_fwd | FullyConnected | [256,2048] | 58368 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x32_sliced1x4_tn | 139.00 | 1077936128 | 10693504.00 | 325760.00 | 12.50 | 97.82 | 7754.94 | false | 0.124604;0.124605;0.124603;0.124605;0.124605 | 1077936128;1077936128;1077936128;1077936128;1077936128 | 10690080;10694304;10696128;10706176;10689984 | 330912;327936;326144;317984;323200 | |
513 | resnetv24_dense0_fwd | FullyConnected | [256,2048] | 58368 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 5.67 | 256000 | 6016.00 | 1024.00 | 59.90 | 36.36 | 45.17 | false | 0.599103;0.590966;0.608134;0.589369;0.607626 | 256000;256000;256000;256000;256000 | 6016;6016;10112;6016;6016 | 1024;1152;1024;1024;1024 |
Showing 1 to 758 of 758 entries