GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_write_bytes | dram_read_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_write_bytes | dram_read_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | resnetv23_batchnorm0_fwd | BatchNorm | [256,3,224,224] | 12384.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 434.33 | 231604224 | 133268010.67 | 135791893.33 | 95.00 | 0.86 | 533.24 | true | 0.952310;0.954310;0.945225;0.952891;0.946209 | 231604224;231604224;231604224;231604224;231604224 | 132465184;130056736;132465184;134873664;154141472 | 135011936;132485920;134973888;137389856;156984288 | |
0 | resnetv23_batchnorm0_fwd | BatchNorm | [256,3,224,224] | 12384.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 1, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::ScalarExp<float>, float>) | 3.33 | 0 | 1504.00 | 5717.33 | 12.10 | 0.00 | 0.00 | true | 0.120988;0.121041;0.121268;0.120936;0.121204 | 0;0;0;0;0 | 5888;5632;5632;5376;9728 | 1504;1504;1504;1504;2784 | |
1 | resnetv23_conv0_fwd | Convolution | [256,3,224,224] | 4114475 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_medium_nn_v1 | 5133.00 | 62889394176 | 11424256.00 | 288808714.67 | 12.70 | 209.47 | 12251.98 | false | 0.127278;0.126780;0.127125;0.126901;0.247707 | 62889394176;62889394176;62889394176;62889394176;62889394176 | 285851456;294257952;286234016;285934176;456312736 | 11131488;11753216;11121248;11388064;16262944 | |
1 | resnetv23_conv0_fwd | Convolution | [256,3,224,224] | 4114475 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 2229.33 | 58496.00 | 7.40 | 0.00 | 0.00 | true | 0.073919;0.074760;0.074501;0.074046;0.073929 | 0;0;0;0;0 | 2144;2144;2144;2400;2400 | 45696;59136;58624;57728;59776 | |
2 | resnetv23_batchnorm1_fwd | BatchNorm | [256,64,112,112] | 182153.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2107.00 | 1241513984 | 140467637.33 | 560638933.33 | 95.40 | 1.77 | 589.23 | true | 0.954115;0.953830;0.954212;0.953911;0.951109 | 1241513984;1241513984;1241513984;1241513984;1241513984 | 140312800;140715200;140374912;140258048;198924480 | 559773792;561348032;573128768;560794976;452069056 | |
3 | resnetv23_relu0_fwd | Activation | [256,64,112,112] | 43072.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2273.00 | 411041792 | 139134752.00 | 559231776.00 | 99.90 | 0.59 | 180.84 | true | 0.998936;0.998986;0.998991;0.998999;0.999001 | 411041792;411041792;411041792;411041792;411041792 | 138406496;138451936;139326720;139625600;204944896 | 559552480;560359008;561388256;557783840;455138368 | |
4 | resnetv23_pool0_fwd | Pooling | [256,64,112,112] | 649161.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 1668.00 | 51380224 | 177097909.33 | 140263498.67 | 72.30 | 0.16 | 30.80 | true | 0.723096;0.722874;0.722887;0.722949;0.722892 | 51380224;51380224;51380224;51380224;51380224 | 176361376;177415488;177723840;177516864;170414688 | 141065312;140707680;140311840;139770976;101669408 | |
5 | resnetv23_stage1_batchnorm0_fwd | BatchNorm | [256,64,56,56] | 51277.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 512.33 | 316669952 | 93127754.67 | 90286677.33 | 87.20 | 1.73 | 618.09 | true | 0.871710;0.872124;0.872535;0.871971;0.872180 | 316669952;316669952;316669952;316669952;316669952 | 89178880;89175104;92447328;89233824;203966560 | 93129440;93126944;93126880;93126880;205524384 | |
6 | resnetv23_stage1_activation0 | Activation | [256,64,56,56] | 10790.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.00 | 102760448 | 79212373.33 | 81330848.00 | 98.70 | 0.64 | 189.59 | true | 0.987696;0.986691;0.987476;0.987030;0.987644 | 102760448;102760448;102760448;102760448;102760448 | 61016896;61014144;77070688;99549536;205521248 | 64208928;57792768;77055520;102728096;205470784 | |
7 | resnetv23_stage1_conv0_fwd | Convolution | [256,64,56,56] | 480594.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 586.00 | 6679429120 | 21415573.33 | 23546058.67 | 24.50 | 148.56 | 11398.34 | false | 0.245253;0.245227;0.245290;0.245226;0.245367 | 6679429120;6679429120;6679429120;6679429120;6679429120 | 16059776;32124032;16060800;16061888;35333696 | 19265440;32108192;19262496;19264544;35319872 | |
7 | resnetv23_stage1_conv0_fwd | Convolution | [256,64,56,56] | 480594.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 522.67 | 26368.00 | 6.10 | 0.00 | 0.00 | true | 0.061168;0.061196;0.061168;0.061180;0.061198 | 0;0;0;0;0 | 608;608;608;352;352 | 26496;26624;26368;26240;26112 | |
8 | resnetv23_stage1_batchnorm1_fwd | BatchNorm | [256,64,56,56] | 14587 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.33 | 316669952 | 97412362.67 | 97461322.67 | 87.20 | 1.63 | 614.50 | true | 0.872228;0.872222;0.872335;0.872136;0.872573 | 316669952;316669952;316669952;316669952;316669952 | 105978976;112396704;93128928;89921952;93129184 | 109244832;112452352;89969312;89969312;93169824 | |
9 | resnetv23_stage1_activation1 | Activation | [256,64,56,56] | 10692.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.00 | 102760448 | 65296053.33 | 66349216.00 | 98.80 | 0.78 | 190.30 | true | 0.987758;0.987631;0.987017;0.987842;0.987404 | 102760448;102760448;102760448;102760448;102760448 | 83461664;57785472;67419424;64205984;67422240 | 83493216;61014272;67436896;61014368;67436896 | |
10 | resnetv23_stage1_conv1_fwd | Convolution | [256,64,56,56] | 3959041.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2963.00 | 31855738880 | 75490496.00 | 144373568.00 | 24.80 | 144.89 | 10751.18 | false | 0.247979;0.247957;0.247987;0.247921;0.247909 | 31855738880;31855738880;31855738880;31855738880;31855738880 | 144273184;143639104;142211712;148899488;145208416 | 76858592;73783040;73026688;83918720;75829856 | |
10 | resnetv23_stage1_conv1_fwd | Convolution | [256,64,56,56] | 3959041.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 7.00 | 237568 | 154304.00 | 263040.00 | 6.20 | 0.57 | 33.94 | true | 0.062438;0.062439;0.062441;0.062435;0.062435 | 237568;237568;237568;237568;237568 | 154304;154304;154304;154304;154304 | 262784;262400;292224;263040;263296 | |
11 | resnetv23_stage1_batchnorm2_fwd | BatchNorm | [256,64,56,56] | 14597.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.33 | 316669952 | 90986058.67 | 89966773.33 | 87.20 | 1.75 | 614.50 | true | 0.872066;0.872431;0.871912;0.872688;0.872486 | 316669952;316669952;316669952;316669952;316669952 | 89915616;89915680;86704256;93126880;93126880 | 88904224;88908576;88910176;92093536;92081568 | |
12 | resnetv23_stage1_activation2 | Activation | [256,64,56,56] | 10822.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.00 | 102760448 | 68507242.67 | 68484256.00 | 98.70 | 0.75 | 189.95 | true | 0.987268;0.987194;0.987304;0.987579;0.987416 | 102760448;102760448;102760448;102760448;102760448 | 80281952;64225408;61014080;77070688;64225632 | 80258080;64203264;64207136;77038496;64207136 | |
13 | resnetv23_stage1_conv2_fwd | Convolution | [256,64,56,56] | 1576907 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2293.33 | 26717716480 | 129067221.33 | 541682592.00 | 24.80 | 39.83 | 11650.17 | false | 0.248041;0.248047;0.248061;0.248055;0.248056 | 26717716480;26717716480;26717716480;26717716480;26717716480 | 129024352;128852928;128060320;131439424;129324384 | 545756064;543822208;528707808;539623040;541602528 | |
13 | resnetv23_stage1_conv2_fwd | Convolution | [256,64,56,56] | 1576907 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 608.00 | 26496.00 | 6.10 | 0.00 | 0.00 | true | 0.061192;0.061200;0.061178;0.061218;0.061204 | 0;0;0;0;0 | 608;608;608;608;608 | 26496;26496;26368;26496;26496 | |
14 | resnetv23_stage1_conv3_fwd | Convolution | [256,64,56,56] | 1575131 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2293.67 | 26717716480 | 128395189.33 | 539690389.33 | 24.80 | 39.99 | 11648.47 | false | 0.248055;0.248052;0.248052;0.248052;0.248054 | 26717716480;26717716480;26717716480;26717716480;26717716480 | 128207648;128713568;127959872;128264352;137452768 | 541579584;534194624;536971840;540519744;543006144 | |
14 | resnetv23_stage1_conv3_fwd | Convolution | [256,64,56,56] | 1575131 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 27264.00 | 6.10 | 0.00 | 0.00 | true | 0.061363;0.061180;0.061194;0.061171;0.061187 | 0;0;0;0;0 | 27648;26624;27264;26880;41600 | 96;96;96;96;96 | |
15 | resnetv23_stage1__plus0 | elemwise_add | [256,256,56,56] | 52516.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 2980.00 | 205520896 | 198920874.67 | 501946261.33 | 97.60 | 0.29 | 68.97 | true | 0.975505;0.975588;0.975577;0.975657;0.975490 | 205520896;205520896;205520896;205520896;205520896 | 513931264;495463072;506067872;500452192;499318720 | 199189888;197681664;209119744;197817760;199754976 | |
16 | resnetv23_stage1_batchnorm3_fwd | BatchNorm | [256,256,56,56] | 57708.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2041.33 | 1266679808 | 144221525.33 | 577483125.33 | 87.40 | 1.76 | 620.52 | true | 0.874620;0.874316;0.874222;0.874183;0.874395 | 1266679808;1266679808;1266679808;1266679808;1266679808 | 143939072;144067040;151100544;143851040;144658464 | 571848192;578235264;582075072;579759680;574454432 | |
17 | resnetv23_stage1_activation3 | Activation | [256,256,56,56] | 42778 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2247.33 | 411041792 | 138728000.00 | 554144672.00 | 99.90 | 0.59 | 182.90 | true | 0.998989;0.999003;0.998987;0.998990;0.998991 | 411041792;411041792;411041792;411041792;411041792 | 138513536;138839456;144663488;137588992;138831008 | 549856032;557364064;563850752;555213920;549072128 | |
18 | resnetv23_stage1_conv4_fwd | Convolution | [256,256,56,56] | 1677905.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2056.33 | 26409435136 | 145508970.67 | 143091424.00 | 24.90 | 91.51 | 12842.98 | false | 0.248998;0.248698;0.248667;0.248941;0.248964 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 145123584;145027584;151739360;145279200;146124128 | 142320224;142270784;144515808;142438240;145359232 | |
18 | resnetv23_stage1_conv4_fwd | Convolution | [256,256,56,56] | 1677905.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 1802.67 | 29568.00 | 6.10 | 0.00 | 0.00 | true | 0.061168;0.061192;0.061292;0.061193;0.061189 | 0;0;0;0;0 | 26240;36480;25856;52096;25984 | 96;5216;96;6752;96 | |
19 | resnetv23_stage1_batchnorm4_fwd | BatchNorm | [256,64,56,56] | 14356.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 514.33 | 316669952 | 90988256.00 | 90985066.67 | 87.20 | 1.74 | 615.69 | true | 0.872407;0.872455;0.872021;0.872131;0.872123 | 316669952;316669952;316669952;316669952;316669952 | 89917664;89918240;99551328;89917280;93128864 | 89928992;89932928;102780192;89923232;93093280 | |
20 | resnetv23_stage1_activation4 | Activation | [256,64,56,56] | 10812.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 543.00 | 102760448 | 68507221.33 | 66347210.67 | 98.70 | 0.76 | 189.25 | true | 0.986981;0.987158;0.986741;0.987622;0.986739 | 102760448;102760448;102760448;102760448;102760448 | 61014080;118819168;67436608;67436896;70648160 | 61001504;118782880;64210848;64204832;70625952 | |
21 | resnetv23_stage1_conv5_fwd | Convolution | [256,64,56,56] | 3957514.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2955.67 | 31855738880 | 75474048.00 | 143269098.67 | 24.80 | 145.63 | 10777.85 | false | 0.247970;0.247927;0.247932;0.247938;0.247982 | 31855738880;31855738880;31855738880;31855738880;31855738880 | 74461664;76915136;79548608;74975328;74531680 | 142244608;143294208;146155328;143620992;142892096 | |
21 | resnetv23_stage1_conv5_fwd | Convolution | [256,64,56,56] | 3957514.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 237568 | 147648.00 | 262528.00 | 6.20 | 0.58 | 50.90 | true | 0.062368;0.062361;0.062360;0.062365;0.062364 | 237568;237568;237568;237568;237568 | 252160;262400;262400;262784;263168 | 147648;147648;147648;147648;147648 | |
22 | resnetv23_stage1_batchnorm5_fwd | BatchNorm | [256,64,56,56] | 14539 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.00 | 316669952 | 92060106.67 | 91027285.33 | 87.20 | 1.73 | 614.89 | true | 0.872537;0.872171;0.872723;0.872518;0.872399 | 316669952;316669952;316669952;316669952;316669952 | 89918112;102763488;93132384;89918880;93129056 | 88904128;101605312;92080352;88910784;92090720 | |
23 | resnetv23_stage1_activation5 | Activation | [256,64,56,56] | 10787.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 537.67 | 102760448 | 65295861.33 | 65275936.00 | 98.70 | 0.79 | 191.12 | true | 0.987206;0.986337;0.987443;0.986743;0.987442 | 102760448;102760448;102760448;102760448;102760448 | 67436608;70648224;61014080;57803104;67436896 | 70630944;70631808;60993184;57783072;64203680 | |
24 | resnetv23_stage1_conv6_fwd | Convolution | [256,64,56,56] | 1586566 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2293.33 | 26717716480 | 128593696.00 | 536632426.67 | 24.80 | 40.16 | 11650.17 | false | 0.248059;0.248041;0.248057;0.248059;0.248057 | 26717716480;26717716480;26717716480;26717716480;26717716480 | 128101888;133042752;128536960;128083104;129142240 | 535783808;541283232;531112096;536688192;537425280 | |
24 | resnetv23_stage1_conv6_fwd | Convolution | [256,64,56,56] | 1586566 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 26069.33 | 6.10 | 0.00 | 0.00 | true | 0.061192;0.061235;0.061185;0.061172;0.061194 | 0;0;0;0;0 | 96;96;96;96;96 | 26368;25984;25856;26240;25984 | |
25 | resnetv23_stage1__plus1 | elemwise_add | [256,256,56,56] | 52744.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 2987.33 | 205520896 | 202382357.33 | 499862645.33 | 97.60 | 0.29 | 68.80 | true | 0.975563;0.975733;0.975745;0.975674;0.975587 | 205520896;205520896;205520896;205520896;205520896 | 199940640;212181344;207679104;198166592;199527328 | 495410976;509380544;507382656;493331424;496794304 | |
26 | resnetv23_stage1_batchnorm6_fwd | BatchNorm | [256,256,56,56] | 57764.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2041.00 | 1266679808 | 144456938.67 | 578563338.67 | 87.40 | 1.75 | 620.62 | true | 0.874493;0.874295;0.874213;0.874671;0.874655 | 1266679808;1266679808;1266679808;1266679808;1266679808 | 143935328;143480416;146128192;144233632;145201856 | 578156032;578332416;580802880;579201568;570513728 | |
27 | resnetv23_stage1_activation6 | Activation | [256,256,56,56] | 42870.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2258.00 | 411041792 | 139537077.33 | 559735477.33 | 99.90 | 0.59 | 182.04 | true | 0.998972;0.998996;0.998865;0.998985;0.998995 | 411041792;411041792;411041792;411041792;411041792 | 139689504;140802336;139065056;138671328;139856672 | 558427456;564234432;560769600;556826272;560009376 | |
28 | resnetv23_stage1_conv7_fwd | Convolution | [256,256,56,56] | 1676201.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2055.00 | 26409435136 | 147965216.00 | 143608938.67 | 24.90 | 90.58 | 12851.31 | false | 0.248968;0.248699;0.248656;0.248960;0.248864 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 145459296;142602912;143663360;143924416;143239040 | 151208320;145530400;147955648;149246336;146693664 | |
28 | resnetv23_stage1_conv7_fwd | Convolution | [256,256,56,56] | 1676201.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 26069.33 | 6.10 | 0.00 | 0.00 | true | 0.061357;0.061181;0.061228;0.061194;0.061219 | 0;0;0;0;0 | 2144;96;96;96;96 | 30080;25984;25856;25984;26240 | |
29 | resnetv23_stage1_batchnorm7_fwd | BatchNorm | [256,64,56,56] | 14461.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.00 | 316669952 | 99552032.00 | 98501525.33 | 87.20 | 1.60 | 614.89 | true | 0.872299;0.872159;0.872591;0.872279;0.872776 | 316669952;316669952;316669952;316669952;316669952 | 128452704;99551712;89917664;86706720;109186720 | 128404384;96343296;89952928;89926304;109208352 | |
30 | resnetv23_stage1_activation7 | Activation | [256,64,56,56] | 10826.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.33 | 102760448 | 69577664.00 | 71703274.67 | 98.70 | 0.73 | 189.83 | true | 0.987581;0.987160;0.986888;0.987464;0.987514 | 102760448;102760448;102760448;102760448;102760448 | 80281952;67436672;54592064;80285024;61014368 | 80261024;70642048;51360256;83473440;64206752 | |
31 | resnetv23_stage1_conv8_fwd | Convolution | [256,64,56,56] | 3957203.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 2956.00 | 31855738880 | 77367008.00 | 144118869.33 | 24.80 | 143.83 | 10776.64 | false | 0.247941;0.247960;0.247938;0.247941;0.247949 | 31855738880;31855738880;31855738880;31855738880;31855738880 | 84612832;77852032;76849152;77399840;73863552 | 148209920;143266368;144698368;144391872;142674912 | |
31 | resnetv23_stage1_conv8_fwd | Convolution | [256,64,56,56] | 3957203.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 237568 | 147648.00 | 288469.33 | 6.20 | 0.54 | 54.83 | true | 0.062365;0.062360;0.062364;0.062364;0.062362 | 237568;237568;237568;237568;237568 | 290048;278016;290176;285440;289920 | 147648;147648;147648;147648;147648 | |
32 | resnetv23_stage1_batchnorm8_fwd | BatchNorm | [256,64,56,56] | 14671.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 515.00 | 316669952 | 90988256.00 | 89961280.00 | 87.20 | 1.75 | 614.89 | true | 0.872231;0.872608;0.872212;0.872025;0.871753 | 316669952;316669952;316669952;316669952;316669952 | 89917536;86706592;125241696;89917664;93129568 | 88893664;88905152;126996384;88900000;92078688 | |
33 | resnetv23_stage1_activation8 | Activation | [256,64,56,56] | 10831.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.00 | 102760448 | 63155210.67 | 64213322.67 | 98.70 | 0.81 | 189.95 | true | 0.987064;0.987230;0.986798;0.986745;0.987560 | 102760448;102760448;102760448;102760448;102760448 | 64225632;57802816;64225632;67436896;61014368 | 64216224;57788576;64213152;67418144;64210592 | |
34 | resnetv23_stage1_conv9_fwd | Convolution | [256,64,56,56] | 1574216.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2291.00 | 26717716480 | 128847872.00 | 538169941.33 | 24.80 | 40.06 | 11662.03 | false | 0.248059;0.248059;0.248053;0.248048;0.248046 | 26717716480;26717716480;26717716480;26717716480;26717716480 | 539934368;537446976;538587200;533555648;538475648 | 127950368;128787616;129808416;128195904;129560096 | |
34 | resnetv23_stage1_conv9_fwd | Convolution | [256,64,56,56] | 1574216.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 25984.00 | 6.10 | 0.00 | 0.00 | true | 0.061332;0.061195;0.061165;0.061208;0.061168 | 0;0;0;0;0 | 96;96;96;96;96 | 25984;25856;25856;26112;32384 | |
35 | resnetv23_stage1__plus2 | elemwise_add | [256,256,56,56] | 52719 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 2979.67 | 205520896 | 199443008.00 | 498678720.00 | 97.60 | 0.29 | 68.97 | true | 0.975415;0.975631;0.975676;0.975507;0.975628 | 205520896;205520896;205520896;205520896;205520896 | 198217728;200763488;206046848;197982336;199347808 | 511312736;496864192;502240128;496931840;496413760 | |
36 | resnetv23_stage2_batchnorm0_fwd | BatchNorm | [256,256,56,56] | 57774.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 2041.00 | 1266679808 | 145922133.33 | 579299989.33 | 87.40 | 1.75 | 620.62 | true | 0.874408;0.874434;0.874480;0.874163;0.873889 | 1266679808;1266679808;1266679808;1266679808;1266679808 | 145801312;144215584;147749504;143845376;148872960 | 580586112;576924160;580389696;573952384;580636832 | |
37 | resnetv23_stage2_activation0 | Activation | [256,256,56,56] | 43156 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 2276.00 | 411041792 | 139769898.67 | 558770666.67 | 99.90 | 0.59 | 180.60 | true | 0.998947;0.998996;0.998994;0.999000;0.998999 | 411041792;411041792;411041792;411041792;411041792 | 138894336;138896000;141519360;137966016;143245824 | 553905664;557760224;562520320;556031456;563280096 | |
38 | resnetv23_stage2_conv0_fwd | Convolution | [256,256,56,56] | 3104501.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 4088.33 | 52818870272 | 145954549.33 | 180797706.67 | 17.70 | 161.65 | 12919.41 | false | 0.176766;0.176239;0.181728;0.176521;0.178424 | 52818870272;52818870272;52818870272;52818870272;52818870272 | 147213888;145290240;145265344;145359520;150150496 | 180924512;179557472;184604384;179551712;181911136 | |
38 | resnetv23_stage2_conv0_fwd | Convolution | [256,256,56,56] | 3104501.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 26026.67 | 6.10 | 0.00 | 0.00 | true | 0.061174;0.061174;0.061181;0.061184;0.061189 | 0;0;0;0;0 | 96;96;96;96;96 | 25984;26112;25856;45440;25984 | |
39 | resnetv23_stage2_batchnorm1_fwd | BatchNorm | [256,128,56,56] | 29019.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1023.00 | 633339904 | 60900640.00 | 122377141.33 | 87.40 | 3.46 | 619.10 | true | 0.873615;0.873723;0.873757;0.873744;0.873482 | 633339904;633339904;633339904;633339904;633339904 | 124165920;123115488;111679744;122668448;121347488 | 61796512;61263872;55546528;61063616;60374432 | |
40 | resnetv23_stage2_activation1 | Activation | [256,128,56,56] | 21639 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1105.00 | 205520896 | 65303253.33 | 131284672.00 | 99.10 | 1.05 | 185.99 | true | 0.991647;0.991609;0.991371;0.991417;0.991226 | 205520896;205520896;205520896;205520896;205520896 | 136910784;105725216;127469632;134848192;131536192 | 68108608;52538816;63397216;67079008;65433536 | |
41 | resnetv23_stage2_conv1_fwd | Convolution | [256,128,56,56] | 3582300.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_small_nn_v1 | 4577.67 | 59241398272 | 33987445.33 | 38483914.67 | 15.10 | 817.45 | 12941.40 | false | 0.149837;0.151238;0.151403;0.151350;0.151331 | 59241398272;59241398272;59241398272;59241398272;59241398272 | 33530560;33452800;34865760;33708320;34723456 | 38196544;38119680;38741248;38874048;38513952 | |
41 | resnetv23_stage2_conv1_fwd | Convolution | [256,128,56,56] | 3582300.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7168.00 | 5.90 | 0.00 | 0.00 | true | 0.059637;0.059368;0.059386;0.059765;0.059385 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7552;7168;7168;7168 | |
42 | resnetv23_stage2_batchnorm2_fwd | BatchNorm | [256,128,28,28] | 7186 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.00 | 162529280 | 102763232.00 | 102757802.67 | 83.50 | 0.79 | 625.11 | true | 0.835349;0.834701;0.834445;0.834383;0.834835 | 162529280;162529280;162529280;162529280;162529280 | 102747264;102754560;102751968;102766880;102775648 | 102763488;102763360;102763104;102762848;102763232 | |
43 | resnetv23_stage2_activation2 | Activation | [256,128,28,28] | 5455.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.00 | 51380224 | 102760800.00 | 102536384.00 | 97.60 | 0.25 | 191.00 | true | 0.975313;0.975814;0.975571;0.975509;0.975703 | 51380224;51380224;51380224;51380224;51380224 | 102766688;102760800;102760800;102760800;102760800 | 102540096;102532416;102533952;102540096;102535104 | |
44 | resnetv23_stage2_conv2_fwd | Convolution | [256,128,28,28] | 1482440 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2112.33 | 26512195584 | 141183125.33 | 280645045.33 | 24.90 | 62.85 | 12551.14 | false | 0.248694;0.248682;0.248677;0.248676;0.248682 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 141684928;141688160;144835232;139748960;140176288 | 283477536;280380544;282045280;277801248;279509312 | |
44 | resnetv23_stage2_conv2_fwd | Convolution | [256,128,28,28] | 1482440 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 352.00 | 7509.33 | 5.90 | 0.00 | 0.00 | true | 0.059374;0.059360;0.059514;0.059391;0.059388 | 0;0;0;0;0 | 352;352;352;352;352 | 10752;7680;7424;7424;7424 | |
45 | resnetv23_stage2_conv3_fwd | Convolution | [256,256,56,56] | 2920950.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_interior_nn_v1 | 4343.00 | 52818870272 | 156974314.67 | 168587658.67 | 16.30 | 162.24 | 12161.84 | false | 0.163171;0.161499;0.162775;0.162214;0.164045 | 52818870272;52818870272;52818870272;52818870272;52818870272 | 154613280;154358272;164097696;161687424;154622240 | 168326976;165697984;171380288;170825600;166610400 | |
45 | resnetv23_stage2_conv3_fwd | Convolution | [256,256,56,56] | 2920950.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7381.33 | 5.90 | 0.00 | 0.00 | true | 0.059367;0.059374;0.059383;0.059360;0.059382 | 0;0;0;0;0 | 96;96;6752;96;96 | 5504;4800;12416;11072;5568 | |
46 | resnetv23_stage2__plus0 | elemwise_add | [256,512,28,28] | 26292.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1476.67 | 102760448 | 201454165.33 | 209924394.67 | 96.50 | 0.25 | 69.59 | true | 0.965209;0.965208;0.965205;0.965187;0.965278 | 102760448;102760448;102760448;102760448;102760448 | 215816320;202610848;200897344;199896384;200854304 | 214443456;210297664;210133408;204550240;209342112 | |
47 | resnetv23_stage2_batchnorm3_fwd | BatchNorm | [256,512,28,28] | 28983.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.00 | 650117120 | 62041653.33 | 124524608.00 | 84.60 | 3.48 | 627.53 | true | 0.846099;0.846095;0.845975;0.846159;0.846501 | 650117120;650117120;650117120;650117120;650117120 | 59569952;62698816;60414688;63159072;63011456 | 119610880;125841152;121270432;126777280;126462240 | |
48 | resnetv23_stage2_activation3 | Activation | [256,512,28,28] | 21518 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1092.33 | 205520896 | 66190197.33 | 133087637.33 | 99.20 | 1.03 | 188.15 | true | 0.991535;0.991594;0.991800;0.991998;0.991712 | 205520896;205520896;205520896;205520896;205520896 | 63475136;64990176;66495552;68018656;67084864 | 127611904;130668288;133706048;136756416;134888576 | |
49 | resnetv23_stage2_conv4_fwd | Convolution | [256,512,28,28] | 1558238.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2106.33 | 26358054912 | 141788501.33 | 66389472.00 | 24.90 | 126.61 | 12513.72 | false | 0.249227;0.248917;0.248844;0.248631;0.248798 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 141367616;141482240;144686368;141412320;142470944 | 67003136;66367040;66596064;66094592;66205312 | |
49 | resnetv23_stage2_conv4_fwd | Convolution | [256,512,28,28] | 1558238.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 9472.00 | 5.90 | 0.00 | 0.00 | true | 0.059402;0.059401;0.059376;0.059668;0.059362 | 0;0;0;0;0 | 7168;13824;7168;14336;7424 | 96;96;96;96;96 | |
50 | resnetv23_stage2_batchnorm4_fwd | BatchNorm | [256,128,28,28] | 7346 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.00 | 162529280 | 102763317.33 | 102711669.33 | 83.50 | 0.79 | 625.11 | true | 0.834775;0.835212;0.834811;0.835114;0.834392 | 162529280;162529280;162529280;162529280;162529280 | 102762848;102773216;102763488;102763104;102763360 | 102706144;102744704;102699680;102701536;102727328 | |
51 | resnetv23_stage2_activation4 | Activation | [256,128,28,28] | 5564 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.00 | 51380224 | 102760800.00 | 102532352.00 | 97.50 | 0.25 | 190.30 | true | 0.975321;0.975622;0.974879;0.975008;0.975592 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102537472;102526784;102532864;102534400;102529792 | |
52 | resnetv23_stage2_conv5_fwd | Convolution | [256,128,28,28] | 3544991.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1176.33 | 14060093440 | 47248522.67 | 63659104.00 | 16.80 | 126.77 | 11952.48 | false | 0.169194;0.168472;0.168470;0.167678;0.167696 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 47974112;48138240;45633216;48992672;44696928 | 64533440;64768832;61675040;65348992;59785408 | |
52 | resnetv23_stage2_conv5_fwd | Convolution | [256,128,28,28] | 3544991.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 363.33 | 739770368 | 103576522.67 | 142748117.33 | 47.80 | 3.00 | 2036.07 | true | 0.478576;0.477854;0.478403;0.478279;0.478610 | 739770368;739770368;739770368;739770368;739770368 | 103554592;103545568;103575584;103638048;103599392 | 142709568;142783072;142806528;142751712;142640832 | |
52 | resnetv23_stage2_conv5_fwd | Convolution | [256,128,28,28] | 3544991.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 338.33 | 704643072 | 142752853.33 | 110743552.00 | 47.60 | 2.78 | 2082.69 | true | 0.475969;0.476049;0.476145;0.476386;0.474885 | 704643072;704643072;704643072;704643072;704643072 | 110678400;110739456;110751488;110771360;110739712 | 142739328;142762848;142753120;142742592;142764576 | |
52 | resnetv23_stage2_conv5_fwd | Convolution | [256,128,28,28] | 3544991.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 113.67 | 369885184 | 619125.33 | 70797632.00 | 46.70 | 5.18 | 3254.11 | true | 0.465436;0.466210;0.467816;0.467596;0.467391 | 369885184;369885184;369885184;369885184;369885184 | 619808;619040;619296;618976;619040 | 70811584;70818240;70704064;70771904;70809408 | |
53 | resnetv23_stage2_batchnorm5_fwd | BatchNorm | [256,128,28,28] | 7326.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 261.00 | 162529280 | 102764426.67 | 103019968.00 | 83.50 | 0.79 | 622.72 | true | 0.835040;0.835332;0.835091;0.835691;0.835335 | 162529280;162529280;162529280;162529280;162529280 | 102764256;102764384;102764640;102764512;102764384 | 103012800;103024544;103014432;103020928;103025408 | |
54 | resnetv23_stage2_activation5 | Activation | [256,128,28,28] | 5486.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.00 | 51380224 | 102761056.00 | 102531776.00 | 97.50 | 0.25 | 190.30 | true | 0.975588;0.974711;0.975998;0.974747;0.975107 | 51380224;51380224;51380224;51380224;51380224 | 102761056;102761056;102761056;102761056;102761056 | 102538944;102527488;102530240;102533824;102531264 | |
55 | resnetv23_stage2_conv6_fwd | Convolution | [256,128,28,28] | 1482742.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2113.67 | 26512195584 | 143035189.33 | 280622965.33 | 24.90 | 62.58 | 12543.22 | false | 0.248688;0.248681;0.248685;0.248674;0.248681 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 144359520;145036928;139709120;138989888;148010784 | 281413280;283034048;277421568;275190400;289554048 | |
55 | resnetv23_stage2_conv6_fwd | Convolution | [256,128,28,28] | 1482742.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 1717.33 | 8789.33 | 6.00 | 0.00 | 0.00 | true | 0.059789;0.059776;0.059822;0.059982;0.059798 | 0;0;0;0;0 | 1632;1632;1632;1888;1888 | 8704;8704;8960;8960;8704 | |
56 | resnetv23_stage2__plus1 | elemwise_add | [256,512,28,28] | 26230.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1477.00 | 102760448 | 200553738.67 | 208892821.33 | 96.50 | 0.25 | 69.57 | true | 0.964940;0.965076;0.964975;0.965108;0.965084 | 102760448;102760448;102760448;102760448;102760448 | 199602528;199436480;205293984;200181088;201877600 | 211187328;205955040;209866496;210856928;203185216 | |
57 | resnetv23_stage2_batchnorm6_fwd | BatchNorm | [256,512,28,28] | 28937.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.00 | 650117120 | 61206314.67 | 122880288.00 | 84.60 | 3.53 | 627.53 | true | 0.846294;0.846433;0.846527;0.846270;0.846410 | 650117120;650117120;650117120;650117120;650117120 | 63577152;59916128;61233280;59593856;62469536 | 127618880;120303296;122942208;119635232;125395360 | |
58 | resnetv23_stage2_activation6 | Activation | [256,512,28,28] | 21539 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1097.33 | 205520896 | 65155029.33 | 130994229.33 | 99.20 | 1.05 | 187.29 | true | 0.991563;0.991680;0.991792;0.991673;0.991721 | 205520896;205520896;205520896;205520896;205520896 | 64855200;64223104;67489632;63802400;66386784 | 130372544;129140288;135687392;128268640;133469856 | |
59 | resnetv23_stage2_conv7_fwd | Convolution | [256,512,28,28] | 1553477.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2107.67 | 26358054912 | 141911669.33 | 67752576.00 | 24.90 | 125.72 | 12505.80 | false | 0.249040;0.248802;0.248850;0.248314;0.248777 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 66106624;66443072;66098496;75803328;70708032 | 141943808;141894944;141729472;156215936;141896256 | |
59 | resnetv23_stage2_conv7_fwd | Convolution | [256,512,28,28] | 1553477.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7338.67 | 5.90 | 0.00 | 0.00 | true | 0.059411;0.059393;0.059370;0.059388;0.059377 | 0;0;0;0;0 | 96;96;96;96;96 | 7424;15104;7168;7424;7168 | |
60 | resnetv23_stage2_batchnorm7_fwd | BatchNorm | [256,128,28,28] | 7213.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.00 | 162529280 | 102763360.00 | 102721056.00 | 83.60 | 0.79 | 625.11 | true | 0.835797;0.835737;0.835867;0.834468;0.835484 | 162529280;162529280;162529280;162529280;162529280 | 102751456;102692128;102717152;102725600;102720416 | 102763104;102763616;102764000;102763360;102762848 | |
61 | resnetv23_stage2_activation7 | Activation | [256,128,28,28] | 5439.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.33 | 51380224 | 102760800.00 | 102527914.67 | 97.50 | 0.25 | 190.77 | true | 0.975323;0.976068;0.974760;0.975233;0.975273 | 51380224;51380224;51380224;51380224;51380224 | 102761056;102760800;102760800;102760800;102760800 | 102525312;102531264;102534592;102524992;102527168 | |
62 | resnetv23_stage2_conv8_fwd | Convolution | [256,128,28,28] | 3558477.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1174.00 | 14060093440 | 46322837.33 | 62271861.33 | 17.00 | 129.47 | 11976.23 | false | 0.169558;0.171686;0.167971;0.168045;0.171005 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 46785920;47089184;42076384;45093408;48048096 | 62517984;63429856;57003552;60867744;64672288 | |
62 | resnetv23_stage2_conv8_fwd | Convolution | [256,128,28,28] | 3558477.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 358.33 | 739770368 | 103601696.00 | 142649301.33 | 47.90 | 3.00 | 2064.48 | true | 0.478900;0.479044;0.479106;0.478826;0.478341 | 739770368;739770368;739770368;739770368;739770368 | 103570208;103609248;103589920;103624544;103605920 | 142496864;142733632;142595008;142754528;142619264 | |
62 | resnetv23_stage2_conv8_fwd | Convolution | [256,128,28,28] | 3558477.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 334.00 | 704643072 | 142703946.67 | 110525674.67 | 47.50 | 2.78 | 2109.71 | true | 0.473892;0.474856;0.475300;0.474740;0.475920 | 704643072;704643072;704643072;704643072;704643072 | 142708736;142702400;142710976;142700704;142699712 | 110570208;110500352;110539584;110510944;110526496 | |
62 | resnetv23_stage2_conv8_fwd | Convolution | [256,128,28,28] | 3558477.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 108.00 | 369885184 | 595509.33 | 70808981.33 | 46.60 | 5.18 | 3424.86 | true | 0.466718;0.465345;0.465904;0.463821;0.465446 | 369885184;369885184;369885184;369885184;369885184 | 596832;595168;595680;595488;595360 | 70824000;70809408;70785088;70823104;70794432 | |
63 | resnetv23_stage2_batchnorm8_fwd | BatchNorm | [256,128,28,28] | 7311 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 261.00 | 162529280 | 102763872.00 | 103014869.33 | 83.50 | 0.79 | 622.72 | true | 0.835422;0.835197;0.835350;0.835883;0.834749 | 162529280;162529280;162529280;162529280;162529280 | 102778976;102763872;102763872;102763872;102763744 | 103014656;103017824;103012128;103018240;103002944 | |
64 | resnetv23_stage2_activation8 | Activation | [256,128,28,28] | 5448.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.33 | 51380224 | 102760800.00 | 102532544.00 | 97.60 | 0.25 | 190.06 | true | 0.973644;0.976076;0.975635;0.976426;0.975176 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102531712;102532480;102533696;102525312;102533440 | |
65 | resnetv23_stage2_conv9_fwd | Convolution | [256,128,28,28] | 1485048.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2117.33 | 26512195584 | 142528064.00 | 280597152.00 | 24.90 | 62.66 | 12521.50 | false | 0.248692;0.248700;0.248684;0.248710;0.248678 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 285418528;278233536;283537280;277434496;280020640 | 147380544;140342144;145469632;139520000;141772416 | |
65 | resnetv23_stage2_conv9_fwd | Convolution | [256,128,28,28] | 1485048.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7168.00 | 5.90 | 0.00 | 0.00 | true | 0.059373;0.059441;0.059395;0.059414;0.059398 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7424;7168;7168;7168 | |
66 | resnetv23_stage2__plus2 | elemwise_add | [256,512,28,28] | 26434.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1477.00 | 102760448 | 201488864.00 | 210060373.33 | 96.50 | 0.25 | 69.57 | true | 0.965290;0.965453;0.965137;0.965325;0.965157 | 102760448;102760448;102760448;102760448;102760448 | 199445536;199945120;208261344;203681344;200840128 | 202451584;210326304;210122560;210563424;209732256 | |
67 | resnetv23_stage2_batchnorm9_fwd | BatchNorm | [256,512,28,28] | 28858 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1036.00 | 650117120 | 61559861.33 | 123571893.33 | 84.60 | 3.51 | 627.53 | true | 0.846299;0.846505;0.846315;0.846269;0.846479 | 650117120;650117120;650117120;650117120;650117120 | 61581696;61098080;59861216;62404288;61999808 | 123607712;122636864;120172992;125261856;124471104 | |
68 | resnetv23_stage2_activation9 | Activation | [256,512,28,28] | 21495 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1098.33 | 205520896 | 65759424.00 | 132211957.33 | 99.20 | 1.04 | 187.12 | true | 0.991571;0.991701;0.991550;0.991375;0.991568 | 205520896;205520896;205520896;205520896;205520896 | 66644064;66672288;66140512;64493696;58431680 | 133986848;134041536;132954272;129694752;117588096 | |
69 | resnetv23_stage2_conv10_fwd | Convolution | [256,512,28,28] | 1557243 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2109.33 | 26358054912 | 144636949.33 | 67465098.67 | 24.90 | 124.27 | 12495.92 | false | 0.248791;0.249056;0.248637;0.249027;0.249026 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 144628512;143213152;151270784;141716384;146069184 | 69422720;66155136;75929600;66138240;66817440 | |
69 | resnetv23_stage2_conv10_fwd | Convolution | [256,512,28,28] | 1557243 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 7168.00 | 5.90 | 0.00 | 0.00 | true | 0.059403;0.059427;0.059361;0.059360;0.059380 | 0;0;0;0;0 | 96;96;96;96;96 | 7168;7168;7424;7168;7168 | |
70 | resnetv23_stage2_batchnorm10_fwd | BatchNorm | [256,128,28,28] | 7206.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 260.00 | 162529280 | 102763701.33 | 102715648.00 | 83.50 | 0.79 | 625.11 | true | 0.834959;0.835226;0.835067;0.835113;0.834454 | 162529280;162529280;162529280;162529280;162529280 | 102701856;102735776;102716512;102726240;102704192 | 102763616;102763872;102763616;102763488;102765152 | |
71 | resnetv23_stage2_activation10 | Activation | [256,128,28,28] | 5445 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.00 | 51380224 | 102760800.00 | 102533866.67 | 97.50 | 0.25 | 191.00 | true | 0.975289;0.975286;0.975345;0.975302;0.975471 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102534016;102523264;102534912;102532672;102537664 | |
72 | resnetv23_stage2_conv11_fwd | Convolution | [256,128,28,28] | 3551382 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 1172.00 | 14060093440 | 45350357.33 | 61247744.00 | 17.10 | 131.90 | 11996.67 | false | 0.167664;0.170574;0.172135;0.172052;0.169158 | 14060093440;14060093440;14060093440;14060093440;14060093440 | 64217216;57260928;61331264;58972096;63439872 | 48077120;42330752;45397760;43563456;47089856 | |
72 | resnetv23_stage2_conv11_fwd | Convolution | [256,128,28,28] | 3551382 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 0u, false>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 358.00 | 739770368 | 103588170.67 | 142617781.33 | 47.90 | 3.00 | 2066.40 | true | 0.479103;0.478086;0.478967;0.479126;0.478479 | 739770368;739770368;739770368;739770368;739770368 | 103584416;103608672;103570784;103571424;103612960 | 142516928;142697952;142491584;142692448;142643968 | |
72 | resnetv23_stage2_conv11_fwd | Convolution | [256,128,28,28] | 3551382 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_c2r_32x32<float, false, false, 0u, false, false>(float*, float2 const*, int, int, int, int, int, int, int, int, int, float, float, cudnn::reduced_divisor, bool, float*, float*, int2, int, int) | 333.33 | 704643072 | 142703050.67 | 110552853.33 | 47.50 | 2.78 | 2113.93 | true | 0.474034;0.474693;0.474483;0.475050;0.475185 | 704643072;704643072;704643072;704643072;704643072 | 142696992;142693984;142723456;142709216;142702944 | 110556800;110550592;110667072;110551168;110543968 | |
72 | resnetv23_stage2_conv11_fwd | Convolution | [256,128,28,28] | 3551382 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void fft2d_r2c_32x32<float, false, 1u, true>(float2*, float const*, int, int, int, int, int, int, int, int, int, cudnn::reduced_divisor, bool, int2, int, int) | 107.00 | 369885184 | 595680.00 | 70774421.33 | 46.60 | 5.18 | 3456.87 | true | 0.465793;0.467038;0.466018;0.466033;0.466219 | 369885184;369885184;369885184;369885184;369885184 | 595424;597600;595616;594848;596000 | 70814144;70776000;70775616;70753728;70771648 | |
73 | resnetv23_stage2_batchnorm11_fwd | BatchNorm | [256,128,28,28] | 7305.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 261.00 | 162529280 | 102764256.00 | 103010176.00 | 83.50 | 0.79 | 622.72 | true | 0.835274;0.835292;0.835460;0.834936;0.836050 | 162529280;162529280;162529280;162529280;162529280 | 102764896;102764384;102763616;102764512;102763872 | 103009184;103025376;102998848;103009952;103011392 | |
74 | resnetv23_stage2_activation11 | Activation | [256,128,28,28] | 5548.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 268.67 | 51380224 | 102760800.00 | 102532149.33 | 97.50 | 0.25 | 191.24 | true | 0.975194;0.975310;0.974588;0.974570;0.975116 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102529408;102529984;102539840;102530784;102535680 | |
75 | resnetv23_stage2_conv12_fwd | Convolution | [256,128,28,28] | 1481424 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2115.67 | 26512195584 | 141431029.33 | 280292714.67 | 24.90 | 62.87 | 12531.37 | false | 0.248687;0.248687;0.248681;0.248544;0.248663 | 26512195584;26512195584;26512195584;26512195584;26512195584 | 140027360;141173600;143002880;142219680;140899808 | 276358784;287292896;280006752;283064000;277807392 | |
75 | resnetv23_stage2_conv12_fwd | Convolution | [256,128,28,28] | 1481424 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 7264.00 | 5.90 | 0.00 | 0.00 | true | 0.059371;0.059385;0.059393;0.059780;0.059404 | 0;0;0;0;0 | 7168;7200;7424;7168;7424 | 96;96;96;96;96 | |
76 | resnetv23_stage2__plus3 | elemwise_add | [256,512,28,28] | 26298 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 1477.00 | 102760448 | 199956853.33 | 204928853.33 | 96.50 | 0.25 | 69.57 | true | 0.965200;0.965033;0.965147;0.964956;0.965072 | 102760448;102760448;102760448;102760448;102760448 | 199328672;200943072;199857664;199548832;200464064 | 209973056;202367360;209062688;202589696;203134176 | |
77 | resnetv23_stage3_batchnorm0_fwd | BatchNorm | [256,512,28,28] | 28827 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 1037.67 | 650117120 | 62949589.33 | 126378794.67 | 84.70 | 3.43 | 626.52 | true | 0.846757;0.846728;0.846786;0.846903;0.846858 | 650117120;650117120;650117120;650117120;650117120 | 61060192;63058720;63472416;63162784;62627264 | 122620064;126600320;127382528;126836576;125699488 | |
78 | resnetv23_stage3_activation0 | Activation | [256,512,28,28] | 21521 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 1103.67 | 205520896 | 65910976.00 | 132523328.00 | 99.20 | 1.04 | 186.22 | true | 0.991404;0.991851;0.991647;0.991396;0.991640 | 205520896;205520896;205520896;205520896;205520896 | 70848736;65832960;65715968;66184000;63690208 | 142343808;132336704;132167904;133065376;128043552 | |
79 | resnetv23_stage3_conv0_fwd | Convolution | [256,512,28,28] | 2869044.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 4163.67 | 52716109824 | 142158826.67 | 87057141.33 | 17.50 | 229.98 | 12660.98 | false | 0.175325;0.174371;0.174359;0.175607;0.176303 | 52716109824;52716109824;52716109824;52716109824;52716109824 | 87056288;87114336;87038528;87045504;87069632 | 141750272;142210720;141790528;142482240;142475232 | |
79 | resnetv23_stage3_conv0_fwd | Convolution | [256,512,28,28] | 2869044.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 7296.00 | 5.90 | 0.00 | 0.00 | true | 0.059394;0.059390;0.059372;0.059537;0.059353 | 0;0;0;0;0 | 96;96;96;96;96 | 21248;7424;7168;7168;7296 | |
80 | resnetv23_stage3_batchnorm1_fwd | BatchNorm | [256,256,28,28] | 14217.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 517.67 | 325058560 | 89916864.00 | 90852768.00 | 84.20 | 1.80 | 627.93 | true | 0.841665;0.841550;0.841936;0.841666;0.841346 | 325058560;325058560;325058560;325058560;325058560 | 83399776;89775488;89759264;93023552;102622656 | 86705280;89917088;86705536;93127968;99551264 | |
81 | resnetv23_stage3_activation1 | Activation | [256,256,28,28] | 10802.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 539.00 | 102760448 | 64226432.00 | 64152373.33 | 98.70 | 0.80 | 190.65 | true | 0.986871;0.987260;0.987061;0.986759;0.987241 | 102760448;102760448;102760448;102760448;102760448 | 76984032;64153280;60946592;57740288;67357248 | 80281952;64228256;61014080;54591904;67436960 | |
82 | resnetv23_stage3_conv1_fwd | Convolution | [256,256,28,28] | 3413785.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 4675.67 | 59215708160 | 67374090.67 | 20237280.00 | 14.70 | 675.89 | 12664.65 | false | 0.146490;0.144873;0.147412;0.145925;0.148282 | 59215708160;59215708160;59215708160;59215708160;59215708160 | 20244192;20293504;20177824;20237120;20230528 | 67580608;68644064;67389824;67066400;67151840 | |
82 | resnetv23_stage3_conv1_fwd | Convolution | [256,256,28,28] | 3413785.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 181.33 | 2688.00 | 5.80 | 0.00 | 0.00 | true | 0.058073;0.058230;0.058034;0.057939;0.058072 | 0;0;0;0;0 | 352;96;5984;96;96 | 2944;2688;14208;2432;2432 | |
83 | resnetv23_stage3_batchnorm2_fwd | BatchNorm | [256,256,14,14] | 6260.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51974986.67 | 55410197.33 | 88.10 | 0.76 | 568.28 | true | 0.880811;0.880635;0.881793;0.879654;0.881751 | 81264640;81264640;81264640;81264640;81264640 | 51974944;51967392;51974624;52004224;51975392 | 55329088;55382944;55367584;55480064;55488864 | |
84 | resnetv23_stage3_activation2 | Activation | [256,256,14,14] | 2766 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380832.00 | 51374560.00 | 95.70 | 0.25 | 188.90 | true | 0.955113;0.957377;0.956393;0.957840;0.956808 | 25690112;25690112;25690112;25690112;25690112 | 51380832;51380832;51380864;51380832;51380832 | 51376352;51368896;51368000;51386400;51378432 | |
85 | resnetv23_stage3_conv2_fwd | Convolution | [256,256,14,14] | 1432153 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2083.33 | 26409435136 | 154778058.67 | 152971680.00 | 24.90 | 85.81 | 12676.53 | false | 0.248986;0.248918;0.248986;0.248886;0.249021 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154811040;153234368;150911232;154137440;151543232 | 154204992;157888032;155467872;154118304;154661312 | |
85 | resnetv23_stage3_conv2_fwd | Convolution | [256,256,14,14] | 1432153 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 352.00 | 2730.67 | 5.80 | 0.00 | 0.00 | true | 0.058090;0.058066;0.058058;0.057951;0.058053 | 0;0;0;0;0 | 352;352;352;352;352 | 2816;8448;2688;2688;2688 | |
86 | resnetv23_stage3_conv3_fwd | Convolution | [256,512,28,28] | 2874140.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_interior_nn_v1 | 4264.33 | 52716109824 | 205050858.67 | 92150080.00 | 16.90 | 177.38 | 12362.10 | false | 0.168920;0.169063;0.168837;0.172092;0.170324 | 52716109824;52716109824;52716109824;52716109824;52716109824 | 206076576;202909344;205808320;203267680;206690464 | 92465600;92489632;92894304;90857088;91495008 | |
86 | resnetv23_stage3_conv3_fwd | Convolution | [256,512,28,28] | 2874140.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 106.67 | 1173.33 | 5.80 | 0.00 | 0.00 | true | 0.058046;0.058034;0.057977;0.057889;0.057991 | 0;0;0;0;0 | 928;1376;1152;992;1376 | 96;96;128;96;128 | |
87 | resnetv23_stage3__plus0 | elemwise_add | [256,1024,14,14] | 13130 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.33 | 51380224 | 34389.33 | 64885.33 | 95.70 | 517.56 | 69.40 | false | 0.956861;0.957040;0.956705;0.956670;0.957173 | 51380224;51380224;51380224;51380224;51380224 | 192;89600;5286080;4992;8576 | 1024;191264;5850752;2112;1280 | |
88 | resnetv23_stage3_batchnorm3_fwd | BatchNorm | [256,1024,14,14] | 25272 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 58288448.00 | 61630517.33 | 90.00 | 2.71 | 585.69 | true | 0.900478;0.900186;0.900223;0.900298;0.900360 | 325058560;325058560;325058560;325058560;325058560 | 52323040;69773280;55815904;90703072;59302368 | 48578560;67996384;51823584;84193024;55045376 | |
89 | resnetv23_stage3_activation3 | Activation | [256,1024,14,14] | 10800.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.00 | 102760448 | 77070805.33 | 78137845.33 | 98.70 | 0.66 | 189.59 | true | 0.987058;0.987306;0.986433;0.987323;0.987403 | 102760448;102760448;102760448;102760448;102760448 | 67436608;83493280;73859648;73859488;83493280 | 67432608;83488768;73858496;77066272;83490112 | |
90 | resnetv23_stage3_conv4_fwd | Convolution | [256,1024,14,14] | 1475595.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2097.33 | 26332364800 | 161247658.67 | 37703253.33 | 24.70 | 132.36 | 12555.17 | false | 0.247370;0.247627;0.247489;0.247680;0.247373 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 156872320;171435136;166529024;160341632;154838944 | 37819008;37648832;36455008;37658912;37802016 | |
90 | resnetv23_stage3_conv4_fwd | Convolution | [256,1024,14,14] | 1475595.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058070;0.058061;0.058757;0.057946;0.058075 | 0;0;0;0;0 | 2432;2432;2432;2432;2688 | 96;96;96;96;96 | |
91 | resnetv23_stage3_batchnorm4_fwd | BatchNorm | [256,256,14,14] | 6359.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52038890.67 | 55726528.00 | 88.20 | 0.75 | 565.65 | true | 0.882473;0.882051;0.883264;0.882367;0.882139 | 81264640;81264640;81264640;81264640;81264640 | 52037728;52039936;52039008;52071392;52037728 | 55732256;55691264;55756064;55689440;55777376 | |
92 | resnetv23_stage3_activation4 | Activation | [256,256,14,14] | 2765 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51368426.67 | 95.70 | 0.25 | 188.44 | true | 0.958139;0.957081;0.957673;0.956890;0.956663 | 25690112;25690112;25690112;25690112;25690112 | 51371904;51364192;51369184;51377568;51356576 | 51380576;51380576;51380576;51380576;51380576 | |
93 | resnetv23_stage3_conv5_fwd | Convolution | [256,256,14,14] | 3394861.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1502.00 | 19365101568 | 86863605.33 | 76758698.67 | 24.70 | 118.35 | 12892.88 | false | 0.246633;0.246804;0.246937;0.246728;0.246916 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 76952416;70979424;76723008;77341504;76600672 | 84977088;91261440;79868352;86218592;89395136 | |
93 | resnetv23_stage3_conv5_fwd | Convolution | [256,256,14,14] | 3394861.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51359904.00 | 150603125.33 | 47.10 | 1.25 | 911.81 | true | 0.470927;0.471674;0.471689;0.470862;0.472219 | 251658240;251658240;251658240;251658240;251658240 | 51396768;51352224;51338272;51381344;51346144 | 150594496;150597056;150630336;150617824;150588096 | |
93 | resnetv23_stage3_conv5_fwd | Convolution | [256,256,14,14] | 3394861.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 261.67 | 269484032 | 151415882.67 | 54414538.67 | 48.30 | 1.31 | 1029.87 | true | 0.482980;0.483577;0.482372;0.483259;0.483156 | 269484032;269484032;269484032;269484032;269484032 | 151414112;151422784;151413856;151396064;151419680 | 54462944;54322912;54433568;54371008;54439040 | |
93 | resnetv23_stage3_conv5_fwd | Convolution | [256,256,14,14] | 3394861.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.67 | 5308416 | 2365440.00 | 9320416.00 | 32.20 | 0.45 | 300.47 | true | 0.320448;0.324463;0.313824;0.327219;0.319645 | 5308416;5308416;5308416;5308416;5308416 | 2365440;2365440;2365440;2365440;2365440 | 9330944;9324032;9306272;9296512;9331360 | |
94 | resnetv23_stage3_batchnorm5_fwd | BatchNorm | [256,256,14,14] | 6313.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51763530.67 | 55945098.67 | 88.20 | 0.75 | 568.28 | true | 0.881154;0.882373;0.881878;0.882682;0.881935 | 81264640;81264640;81264640;81264640;81264640 | 51780864;51768000;51751328;51760384;51762208 | 55944896;55941664;55918816;55968480;55948736 | |
95 | resnetv23_stage3_activation5 | Activation | [256,256,14,14] | 2739.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51366986.67 | 95.70 | 0.25 | 187.98 | true | 0.956298;0.957403;0.956327;0.958126;0.958220 | 25690112;25690112;25690112;25690112;25690112 | 51362368;51372992;51375392;51361472;51365600 | 51380576;51380608;51380576;51380576;51380576 | |
96 | resnetv23_stage3_conv6_fwd | Convolution | [256,256,14,14] | 1421372.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2077.00 | 26409435136 | 154006848.00 | 151528000.00 | 24.90 | 86.44 | 12715.18 | false | 0.249018;0.249003;0.248642;0.249010;0.248975 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 157501824;153665280;152584896;153235296;155119968 | 151901984;151478944;151203072;150583488;153738016 | |
96 | resnetv23_stage3_conv6_fwd | Convolution | [256,256,14,14] | 1421372.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058061;0.058085;0.058097;0.057899;0.058071 | 0;0;0;0;0 | 6752;96;96;96;96 | 16512;2432;2688;2432;2432 | |
97 | resnetv23_stage3__plus1 | elemwise_add | [256,1024,14,14] | 13195.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 3941898.67 | 4526304.00 | 95.70 | 6.07 | 69.37 | true | 0.956501;0.956529;0.956317;0.956629;0.956724 | 51380224;51380224;51380224;51380224;51380224 | 6028384;804064;4067776;2959392;4798528 | 6644224;1367712;4650400;3532320;5396192 | |
98 | resnetv23_stage3_batchnorm6_fwd | BatchNorm | [256,1024,14,14] | 25008 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 55048885.33 | 58149450.67 | 90.00 | 2.87 | 585.69 | true | 0.900158;0.900419;0.900743;0.900432;0.900172 | 325058560;325058560;325058560;325058560;325058560 | 55831072;55812160;62796960;55820320;69791200 | 48574016;51815552;61520928;51810176;61523232 | |
99 | resnetv23_stage3_activation6 | Activation | [256,1024,14,14] | 10790 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.00 | 102760448 | 66372917.33 | 66363104.00 | 98.70 | 0.77 | 189.95 | true | 0.987107;0.986755;0.987111;0.986977;0.986721 | 102760448;102760448;102760448;102760448;102760448 | 73878464;77070752;61014592;51380640;64225696 | 77070496;77064480;57801600;51380640;64223232 | |
100 | resnetv23_stage3_conv7_fwd | Convolution | [256,1024,14,14] | 1477386.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 157649792.00 | 37784032.00 | 24.80 | 134.74 | 12565.15 | false | 0.247225;0.247845;0.247751;0.247777;0.247427 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 157190368;156249344;163276768;157041824;158717184 | 37805120;37741888;37815456;37775776;37771200 | |
100 | resnetv23_stage3_conv7_fwd | Convolution | [256,1024,14,14] | 1477386.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058075;0.058089;0.058046;0.057946;0.058329 | 0;0;0;0;0 | 96;96;5472;96;96 | 2688;2432;13440;2688;2432 | |
101 | resnetv23_stage3_batchnorm7_fwd | BatchNorm | [256,256,14,14] | 6273.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52040352.00 | 55736469.33 | 88.30 | 0.75 | 568.28 | true | 0.882575;0.882644;0.882616;0.883165;0.882288 | 81264640;81264640;81264640;81264640;81264640 | 52044288;52055808;52019936;52041056;52035712 | 55738656;55718432;55800800;55752320;55708704 | |
102 | resnetv23_stage3_activation7 | Activation | [256,256,14,14] | 2753.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.33 | 25690112 | 51380576.00 | 51373301.33 | 95.70 | 0.25 | 187.06 | true | 0.956545;0.958018;0.957659;0.957186;0.956035 | 25690112;25690112;25690112;25690112;25690112 | 51380640;51380576;51380576;51380576;51380576 | 51379392;51364224;51365536;51385824;51374976 | |
103 | resnetv23_stage3_conv8_fwd | Convolution | [256,256,14,14] | 3400134.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1504.67 | 19365101568 | 88244053.33 | 77099114.67 | 24.70 | 117.12 | 12870.02 | false | 0.246729;0.246879;0.247092;0.246465;0.247002 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 90007264;89108288;89919648;85704224;79967712 | 77913760;77650880;75802016;77844448;75344448 | |
103 | resnetv23_stage3_conv8_fwd | Convolution | [256,256,14,14] | 3400134.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 277.33 | 251658240 | 51386250.67 | 150578506.67 | 47.10 | 1.25 | 907.42 | true | 0.471724;0.470927;0.470769;0.471362;0.470985 | 251658240;251658240;251658240;251658240;251658240 | 51379872;51389472;51387552;51391392;51381728 | 150574016;150647744;150565216;150542144;150596288 | |
103 | resnetv23_stage3_conv8_fwd | Convolution | [256,256,14,14] | 3400134.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.33 | 269484032 | 151342720.00 | 54190293.33 | 48.20 | 1.31 | 1035.15 | true | 0.482724;0.482532;0.482191;0.482914;0.481798 | 269484032;269484032;269484032;269484032;269484032 | 151364864;151344896;151345056;151334752;151338208 | 54260384;54108160;54018176;54236544;54226176 | |
103 | resnetv23_stage3_conv8_fwd | Convolution | [256,256,14,14] | 3400134.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2360064.00 | 9353472.00 | 30.90 | 0.45 | 325.01 | true | 0.308030;0.307976;0.310976;0.318500;0.304811 | 5308416;5308416;5308416;5308416;5308416 | 2360064;2360064;2360064;2360064;2360064 | 9398688;9283584;9376384;9378304;9305728 | |
104 | resnetv23_stage3_batchnorm8_fwd | BatchNorm | [256,256,14,14] | 6329.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51771840.00 | 55891221.33 | 88.20 | 0.75 | 568.28 | true | 0.882872;0.882056;0.881406;0.881947;0.882751 | 81264640;81264640;81264640;81264640;81264640 | 51777856;51763360;51764736;51772928;51793088 | 55891168;55866464;55872288;55910208;55929184 | |
105 | resnetv23_stage3_activation8 | Activation | [256,256,14,14] | 2752.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51372960.00 | 95.70 | 0.25 | 188.44 | true | 0.957026;0.956804;0.958843;0.958199;0.957027 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51377504;51369248;51378144;51372128;51365184 | |
106 | resnetv23_stage3_conv9_fwd | Convolution | [256,256,14,14] | 1416510 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2078.33 | 26409435136 | 154511861.33 | 152238901.33 | 24.90 | 86.09 | 12707.03 | false | 0.249007;0.249018;0.248995;0.249007;0.248982 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154670656;159099712;154262784;154602144;153477632 | 152037888;154206976;150839840;153838976;150563264 | |
106 | resnetv23_stage3_conv9_fwd | Convolution | [256,256,14,14] | 1416510 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058092;0.058089;0.058313;0.057938;0.058070 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;2432;2432 | |
107 | resnetv23_stage3__plus2 | elemwise_add | [256,1024,14,14] | 13251.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 2722122.67 | 3088074.67 | 95.70 | 8.84 | 69.34 | true | 0.956968;0.956732;0.957115;0.957105;0.956929 | 51380224;51380224;51380224;51380224;51380224 | 5764800;128;160;6030400;2401408 | 6314656;384;576;6645792;2948992 | |
108 | resnetv23_stage3_batchnorm9_fwd | BatchNorm | [256,1024,14,14] | 25192.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 553.67 | 325058560 | 53975221.33 | 56979818.67 | 90.00 | 2.93 | 587.10 | true | 0.899910;0.899879;0.900250;0.900066;0.900048 | 325058560;325058560;325058560;325058560;325058560 | 61529472;55060800;51807360;48579744;55057504 | 62799712;59305024;55814656;52335008;55819776 | |
109 | resnetv23_stage3_activation9 | Activation | [256,1024,14,14] | 11115.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.33 | 102760448 | 65298730.67 | 66363104.00 | 98.70 | 0.78 | 189.83 | true | 0.987560;0.987149;0.987507;0.987459;0.986684 | 102760448;102760448;102760448;102760448;102760448 | 61014144;73859488;61014592;73859488;61022112 | 64221568;70645120;64222624;73866368;61012608 | |
110 | resnetv23_stage3_conv10_fwd | Convolution | [256,1024,14,14] | 1479344.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.33 | 26332364800 | 157991018.67 | 37752490.67 | 24.80 | 134.52 | 12573.15 | false | 0.247827;0.247333;0.248347;0.248535;0.247489 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 158716704;159574880;161603776;155442336;155681472 | 37751712;37351872;37787776;37786240;37719520 | |
110 | resnetv23_stage3_conv10_fwd | Convolution | [256,1024,14,14] | 1479344.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058055;0.058065;0.058644;0.057893;0.058080 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2688;2432;2432 | |
111 | resnetv23_stage3_batchnorm10_fwd | BatchNorm | [256,256,14,14] | 6353.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52048768.00 | 55752064.00 | 88.30 | 0.75 | 565.65 | true | 0.883000;0.882988;0.882458;0.882299;0.883020 | 81264640;81264640;81264640;81264640;81264640 | 52049216;52063360;52051008;52023936;52046080 | 55758176;55727360;55767680;55763776;55734240 | |
112 | resnetv23_stage3_activation10 | Activation | [256,256,14,14] | 2754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51369194.67 | 95.60 | 0.25 | 188.90 | true | 0.955217;0.957980;0.956414;0.955794;0.956815 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380608;51380576 | 51374752;51374304;51361824;51371456;51361088 | |
113 | resnetv23_stage3_conv11_fwd | Convolution | [256,256,14,14] | 3397769 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.00 | 19365101568 | 83635072.00 | 77491530.67 | 24.70 | 120.19 | 12867.18 | false | 0.246874;0.246725;0.246664;0.246889;0.246971 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 77083296;77270080;77412288;78035776;77792224 | 85519904;85504032;82013792;83106304;82294880 | |
113 | resnetv23_stage3_conv11_fwd | Convolution | [256,256,14,14] | 3397769 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.33 | 251658240 | 51380938.67 | 150583797.33 | 47.10 | 1.25 | 910.71 | true | 0.471644;0.471292;0.471655;0.470953;0.471137 | 251658240;251658240;251658240;251658240;251658240 | 51382432;51377952;51409696;51382432;51374112 | 150584512;150549728;150628832;150613696;150553184 | |
113 | resnetv23_stage3_conv11_fwd | Convolution | [256,256,14,14] | 3397769 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 258.00 | 269484032 | 151384128.00 | 54335861.33 | 48.30 | 1.31 | 1044.51 | true | 0.482116;0.482774;0.482381;0.483993;0.482530 | 269484032;269484032;269484032;269484032;269484032 | 151383232;151384896;151398112;151384256;151354944 | 54351680;54194560;54375424;54280480;54479360 | |
113 | resnetv23_stage3_conv11_fwd | Convolution | [256,256,14,14] | 3397769 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9306325.33 | 30.00 | 0.46 | 331.78 | true | 0.315086;0.300447;0.299283;0.299920;0.300955 | 5308416;5308416;5308416;5308416;5308416 | 2360320;2359552;2359552;2359552;2359552 | 9268608;9341312;9305344;9272320;9356960 | |
114 | resnetv23_stage3_batchnorm11_fwd | BatchNorm | [256,256,14,14] | 6428 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51767680.00 | 55940725.33 | 88.20 | 0.75 | 568.28 | true | 0.882312;0.882003;0.882334;0.882440;0.881444 | 81264640;81264640;81264640;81264640;81264640 | 51768384;51763424;51770784;51763872;51781216 | 55944192;55941568;55952000;55936416;55893664 | |
115 | resnetv23_stage3_activation11 | Activation | [256,256,14,14] | 2781.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51373696.00 | 95.70 | 0.25 | 187.98 | true | 0.957144;0.955349;0.958306;0.957612;0.957692 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380672;51380576 | 51369152;51376672;51375264;51367040;51379584 | |
116 | resnetv23_stage3_conv12_fwd | Convolution | [256,256,14,14] | 1423774 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2080.33 | 26409435136 | 155975552.00 | 152112074.67 | 24.90 | 85.72 | 12694.81 | false | 0.248709;0.249029;0.249055;0.248943;0.248973 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154466176;155136544;158323936;159101696;154452672 | 151200608;151476512;152673600;154024448;152186112 | |
116 | resnetv23_stage3_conv12_fwd | Convolution | [256,256,14,14] | 1423774 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058065;0.058070;0.058092;0.058055;0.058097 | 0;0;0;0;0 | 1120;96;96;96;96 | 4480;2432;2432;2432;2432 | |
117 | resnetv23_stage3__plus3 | elemwise_add | [256,1024,14,14] | 13275.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 2761290.67 | 3350058.67 | 95.70 | 8.41 | 69.43 | true | 0.956263;0.956658;0.956627;0.956716;0.956552 | 51380224;51380224;51380224;51380224;51380224 | 2874976;4742208;128;666688;4869024 | 3519136;5318976;352;1212064;5475520 | |
118 | resnetv23_stage3_batchnorm12_fwd | BatchNorm | [256,1024,14,14] | 25134.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 59365962.67 | 63959541.33 | 90.00 | 2.64 | 586.40 | true | 0.900550;0.900349;0.899854;0.900408;0.900723 | 325058560;325058560;325058560;325058560;325058560 | 55816800;69782240;104674848;62792768;59303616 | 48573216;64763392;97148608;58288512;55045984 | |
119 | resnetv23_stage3_activation12 | Activation | [256,1024,14,14] | 10782 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 542.33 | 102760448 | 68507381.33 | 68505418.67 | 98.70 | 0.75 | 189.48 | true | 0.986432;0.987156;0.987433;0.986779;0.987486 | 102760448;102760448;102760448;102760448;102760448 | 70648224;64225696;109183328;64225696;70648224 | 70646304;64220896;109176480;64224064;70645888 | |
120 | resnetv23_stage3_conv13_fwd | Convolution | [256,1024,14,14] | 1475737 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.67 | 26332364800 | 160882570.67 | 37784842.67 | 24.70 | 132.54 | 12559.15 | false | 0.246937;0.247661;0.247303;0.247606;0.247417 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 162480384;156946720;163220608;167767104;156721856 | 37761856;37831584;37769824;37501312;37822848 | |
120 | resnetv23_stage3_conv13_fwd | Convolution | [256,1024,14,14] | 1475737 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 1802.67 | 5930.67 | 5.80 | 0.00 | 0.00 | true | 0.058117;0.058095;0.058072;0.057956;0.058048 | 0;0;0;0;0 | 96;5472;96;5216;96 | 2432;12928;2432;12928;2432 | |
121 | resnetv23_stage3_batchnorm13_fwd | BatchNorm | [256,256,14,14] | 6304.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 52042698.67 | 55726549.33 | 88.20 | 0.75 | 564.34 | true | 0.882294;0.882660;0.881947;0.882292;0.882023 | 81264640;81264640;81264640;81264640;81264640 | 52052704;52032128;52036640;52038752;52057696 | 55683776;55794080;55744192;55724512;55710944 | |
122 | resnetv23_stage3_activation13 | Activation | [256,256,14,14] | 2755.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51381269.33 | 51375125.33 | 95.70 | 0.25 | 188.90 | true | 0.957410;0.957153;0.956795;0.956818;0.957882 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51382624;51380576;51380608;51387232 | 51374016;51373952;51373728;51377408;51391136 | |
123 | resnetv23_stage3_conv14_fwd | Convolution | [256,256,14,14] | 3390433.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.00 | 19365101568 | 82141472.00 | 76701013.33 | 24.70 | 121.91 | 12927.30 | false | 0.246711;0.246761;0.246702;0.246555;0.246531 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 87335488;79889056;80320640;83369120;82734656 | 74190432;77998432;76339008;77391360;76372672 | |
123 | resnetv23_stage3_conv14_fwd | Convolution | [256,256,14,14] | 3390433.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.67 | 251658240 | 51375712.00 | 150558240.00 | 47.10 | 1.25 | 909.61 | true | 0.472049;0.471105;0.470873;0.471293;0.471539 | 251658240;251658240;251658240;251658240;251658240 | 150598848;150545888;150529984;150620992;150519904 | 51392864;51367328;51357024;51366944;51401504 | |
123 | resnetv23_stage3_conv14_fwd | Convolution | [256,256,14,14] | 3390433.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.67 | 269484032 | 151332533.33 | 54205877.33 | 48.20 | 1.31 | 1033.82 | true | 0.482032;0.482566;0.482173;0.481955;0.482900 | 269484032;269484032;269484032;269484032;269484032 | 151345120;151303296;151360672;151349184;151255616 | 54067936;54098016;54254400;54265216;54298336 | |
123 | resnetv23_stage3_conv14_fwd | Convolution | [256,256,14,14] | 3390433.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9354165.33 | 30.50 | 0.45 | 325.01 | true | 0.305900;0.304301;0.304484;0.308804;0.303232 | 5308416;5308416;5308416;5308416;5308416 | 9334400;9399552;9375136;9322112;9352960 | 2360064;2359552;2359552;2359552;2359552 | |
124 | resnetv23_stage3_batchnorm14_fwd | BatchNorm | [256,256,14,14] | 6300 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51771978.67 | 55896469.33 | 88.20 | 0.75 | 568.28 | true | 0.881913;0.882354;0.881701;0.881514;0.881869 | 81264640;81264640;81264640;81264640;81264640 | 51772160;51767008;51752800;51783840;51776768 | 55889088;55895232;55871200;55906720;55905088 | |
125 | resnetv23_stage3_activation14 | Activation | [256,256,14,14] | 2774.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51377269.33 | 95.70 | 0.25 | 188.44 | true | 0.957890;0.956390;0.957462;0.957081;0.957021 | 25690112;25690112;25690112;25690112;25690112 | 51379776;51374688;51375584;51376704;51379520 | 51380576;51380608;51380576;51380576;51380576 | |
126 | resnetv23_stage3_conv15_fwd | Convolution | [256,256,14,14] | 1413897 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2079.00 | 26409435136 | 155342176.00 | 151682517.33 | 24.90 | 86.02 | 12702.95 | false | 0.248940;0.248947;0.248992;0.248847;0.248789 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153933728;155804288;157664192;153422400;156288512 | 150480512;152379328;152187712;150217312;154184736 | |
126 | resnetv23_stage3_conv15_fwd | Convolution | [256,256,14,14] | 1413897 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2816.00 | 5.80 | 0.00 | 0.00 | true | 0.058066;0.058624;0.058068;0.057940;0.058034 | 0;0;0;0;0 | 96;96;96;352;96 | 2816;2432;2432;3200;12928 | |
127 | resnetv23_stage3__plus4 | elemwise_add | [256,1024,14,14] | 13247.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 4287306.67 | 4870250.67 | 95.70 | 5.61 | 69.34 | true | 0.957214;0.957076;0.956946;0.956812;0.957112 | 51380224;51380224;51380224;51380224;51380224 | 4192000;989344;5283328;6055808;5135424 | 3605056;517952;4688096;5438432;4568768 | |
128 | resnetv23_stage3_batchnorm15_fwd | BatchNorm | [256,1024,14,14] | 25089.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 57207648.00 | 61638986.67 | 90.00 | 2.74 | 585.69 | true | 0.899957;0.900346;0.900301;0.900246;0.900210 | 325058560;325058560;325058560;325058560;325058560 | 48577856;51813248;61529600;58280096;64776160 | 48845856;55827904;66297824;62791232;69800800 | |
129 | resnetv23_stage3_activation15 | Activation | [256,1024,14,14] | 10778 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.67 | 102760448 | 68507168.00 | 67432426.67 | 98.70 | 0.76 | 189.71 | true | 0.987522;0.987375;0.987133;0.986574;0.987204 | 102760448;102760448;102760448;102760448;102760448 | 67436672;67436960;70647872;64225696;70648224 | 67430112;64220896;70643520;64223648;70648704 | |
130 | resnetv23_stage3_conv16_fwd | Convolution | [256,1024,14,14] | 1474688.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2098.33 | 26332364800 | 159593269.33 | 37767573.33 | 24.80 | 133.42 | 12549.18 | false | 0.247858;0.247953;0.247169;0.247736;0.247308 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 155772704;158012672;161150208;160527616;160239520 | 37709760;37808064;37765536;37856352;37729120 | |
130 | resnetv23_stage3_conv16_fwd | Convolution | [256,1024,14,14] | 1474688.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2560.00 | 5.80 | 0.00 | 0.00 | true | 0.058101;0.058068;0.058095;0.057880;0.058103 | 0;0;0;0;0 | 12928;2432;2432;2432;2816 | 5216;96;96;96;96 | |
131 | resnetv23_stage3_batchnorm16_fwd | BatchNorm | [256,256,14,14] | 6325.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52051872.00 | 55719317.33 | 88.20 | 0.75 | 565.65 | true | 0.882469;0.882068;0.883277;0.882532;0.882469 | 81264640;81264640;81264640;81264640;81264640 | 52043136;52062464;52041952;52054144;52058336 | 55724512;55676096;55770048;55713952;55719488 | |
132 | resnetv23_stage3_activation16 | Activation | [256,256,14,14] | 2757.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51367264.00 | 95.70 | 0.25 | 187.98 | true | 0.957118;0.956035;0.956116;0.957277;0.958220 | 25690112;25690112;25690112;25690112;25690112 | 51364192;51372032;51371968;51360896;51365632 | 51380576;51380608;51380576;51380576;51380576 | |
133 | resnetv23_stage3_conv17_fwd | Convolution | [256,256,14,14] | 3392477 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.67 | 19365101568 | 83083210.67 | 77549045.33 | 24.70 | 120.56 | 12921.55 | false | 0.246654;0.246904;0.246571;0.246680;0.246629 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 85526336;76447904;82688896;81034400;87804480 | 79274944;77900672;77488512;77257952;76952576 | |
133 | resnetv23_stage3_conv17_fwd | Convolution | [256,256,14,14] | 3392477 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.33 | 251658240 | 51381856.00 | 150619946.67 | 47.10 | 1.25 | 910.71 | true | 0.470515;0.471059;0.471178;0.470891;0.470308 | 251658240;251658240;251658240;251658240;251658240 | 51392864;51373216;51400800;51379488;51367776 | 150556128;150804800;150638560;150665152;150552544 | |
133 | resnetv23_stage3_conv17_fwd | Convolution | [256,256,14,14] | 3392477 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 259.67 | 269484032 | 151383808.00 | 54339082.67 | 48.20 | 1.31 | 1037.81 | true | 0.482007;0.482377;0.482407;0.482285;0.482079 | 269484032;269484032;269484032;269484032;269484032 | 151417664;151381184;151389856;151372640;151380384 | 54432480;54298848;54473568;54235456;54285920 | |
133 | resnetv23_stage3_conv17_fwd | Convolution | [256,256,14,14] | 3392477 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9317600.00 | 30.50 | 0.45 | 318.50 | true | 0.302871;0.300786;0.305221;0.314677;0.306252 | 5308416;5308416;5308416;5308416;5308416 | 9359488;9149440;9348096;9251360;9353344 | 2359552;2359552;2359552;2359552;2359552 | |
134 | resnetv23_stage3_batchnorm17_fwd | BatchNorm | [256,256,14,14] | 6374.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 51756629.33 | 55942090.67 | 88.30 | 0.75 | 566.96 | true | 0.882685;0.883002;0.882504;0.882519;0.881912 | 81264640;81264640;81264640;81264640;81264640 | 51755360;51753184;51767616;51748736;51761344 | 55975744;55962080;55938656;55905088;55925536 | |
135 | resnetv23_stage3_activation17 | Activation | [256,256,14,14] | 2746.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380586.67 | 51373301.33 | 95.80 | 0.25 | 188.90 | true | 0.957487;0.956323;0.957514;0.958933;0.957876 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51395968;51380608;51380576 | 51371552;51372096;51380736;51366528;51376256 | |
136 | resnetv23_stage3_conv18_fwd | Convolution | [256,256,14,14] | 1421271.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.00 | 26409435136 | 157848032.00 | 154019552.00 | 24.90 | 84.68 | 12690.74 | false | 0.248933;0.249028;0.249066;0.249001;0.248822 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 158226912;154339872;160267840;171571136;155049344 | 155850752;150766752;154286816;157237504;151921088 | |
136 | resnetv23_stage3_conv18_fwd | Convolution | [256,256,14,14] | 1421271.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058087;0.058046;0.058706;0.057903;0.058060 | 0;0;0;0;0 | 2432;2688;2688;2432;2432 | 96;96;96;96;96 | |
137 | resnetv23_stage3__plus5 | elemwise_add | [256,1024,14,14] | 13202.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 2219648.00 | 2793877.33 | 95.70 | 10.25 | 69.34 | true | 0.956723;0.956761;0.956410;0.956474;0.956544 | 51380224;51380224;51380224;51380224;51380224 | 845952;1716224;128;5643904;4096768 | 1423200;2294336;768;6180128;4664096 | |
138 | resnetv23_stage3_batchnorm18_fwd | BatchNorm | [256,1024,14,14] | 25611.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 57156768.00 | 61624256.00 | 90.20 | 2.74 | 586.40 | true | 0.902020;0.901904;0.902044;0.901927;0.902105 | 325058560;325058560;325058560;325058560;325058560 | 51759424;51762784;64705792;58235648;61471872 | 55805408;55802880;69782400;62787648;66279712 | |
139 | resnetv23_stage3_activation18 | Activation | [256,1024,14,14] | 10888.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.67 | 102760448 | 67438218.67 | 66361525.33 | 98.70 | 0.77 | 190.06 | true | 0.987076;0.987258;0.987158;0.987216;0.987130 | 102760448;102760448;102760448;102760448;102760448 | 64221824;80281920;64220960;57799104;70641792 | 67436960;80291488;64225632;61014432;70652064 | |
140 | resnetv23_stage3_conv19_fwd | Convolution | [256,1024,14,14] | 1473471.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 157747914.67 | 37815754.67 | 24.80 | 134.65 | 12565.15 | false | 0.247167;0.248324;0.247468;0.248049;0.247041 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 155008192;158119904;157716416;157407424;164298976 | 37840224;37773280;37843872;37833760;37745184 | |
140 | resnetv23_stage3_conv19_fwd | Convolution | [256,1024,14,14] | 1473471.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058078;0.058072;0.058077;0.057924;0.058722 | 0;0;0;0;0 | 2688;2432;2432;2688;4480 | 96;96;96;96;1120 | |
141 | resnetv23_stage3_batchnorm19_fwd | BatchNorm | [256,256,14,14] | 6379.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 52053568.00 | 55720917.33 | 88.20 | 0.75 | 564.34 | true | 0.882036;0.881890;0.882222;0.881954;0.881675 | 81264640;81264640;81264640;81264640;81264640 | 52062208;52045824;52054912;52059968;52013184 | 55686208;55789024;55688320;55664640;55788224 | |
142 | resnetv23_stage3_activation19 | Activation | [256,256,14,14] | 2775 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51376864.00 | 95.80 | 0.25 | 188.44 | true | 0.957814;0.957230;0.957821;0.958017;0.956552 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51368096;51378336;51376928;51381344;51375328 | |
143 | resnetv23_stage3_conv20_fwd | Convolution | [256,256,14,14] | 3392987.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1499.00 | 19365101568 | 84504245.33 | 77908362.67 | 24.70 | 119.23 | 12918.68 | false | 0.246445;0.246714;0.246692;0.246831;0.246648 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 83199424;83097344;86645120;94232096;83668192 | 77657312;78414528;77653248;79159616;76281664 | |
143 | resnetv23_stage3_conv20_fwd | Convolution | [256,256,14,14] | 3392987.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 275.67 | 251658240 | 51384309.33 | 150591893.33 | 47.10 | 1.25 | 912.91 | true | 0.472039;0.470798;0.471245;0.472350;0.470457 | 251658240;251658240;251658240;251658240;251658240 | 51392288;51415200;51375520;51372960;51385120 | 150605760;150552512;150595776;150642656;150574144 | |
143 | resnetv23_stage3_conv20_fwd | Convolution | [256,256,14,14] | 3392987.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.67 | 269484032 | 151336021.33 | 54206133.33 | 48.20 | 1.31 | 1033.82 | true | 0.482047;0.482462;0.482660;0.482126;0.482623 | 269484032;269484032;269484032;269484032;269484032 | 54309888;54181888;54093024;54173024;54263488 | 151329216;151342016;151336832;151327424;151345920 | |
143 | resnetv23_stage3_conv20_fwd | Convolution | [256,256,14,14] | 3392987.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9341621.33 | 30.70 | 0.45 | 312.26 | true | 0.303929;0.312776;0.309793;0.299891;0.307851 | 5308416;5308416;5308416;5308416;5308416 | 9294112;9378848;9353088;9314688;9357088 | 2359552;2359552;2359552;2359552;2359552 | |
144 | resnetv23_stage3_batchnorm20_fwd | BatchNorm | [256,256,14,14] | 6561 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51768704.00 | 55891360.00 | 88.20 | 0.75 | 568.28 | true | 0.882640;0.882872;0.881799;0.882277;0.882066 | 81264640;81264640;81264640;81264640;81264640 | 51758688;51776608;51775648;51753344;51771776 | 55879200;55897504;55912864;55872800;55897376 | |
145 | resnetv23_stage3_activation20 | Activation | [256,256,14,14] | 2790.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.00 | 25690112 | 51380576.00 | 51374368.00 | 95.60 | 0.25 | 187.52 | true | 0.956416;0.956536;0.955612;0.956366;0.957997 | 25690112;25690112;25690112;25690112;25690112 | 51374368;51387680;51375008;51367584;51373728 | 51380576;51380576;51380576;51380576;51380608 | |
146 | resnetv23_stage3_conv21_fwd | Convolution | [256,256,14,14] | 1415817.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2077.00 | 26409435136 | 154845269.33 | 153261386.67 | 24.90 | 85.72 | 12715.18 | false | 0.249001;0.248963;0.249041;0.248997;0.248959 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153891936;155349632;154169088;156938208;155017088 | 151858176;154087456;151310144;155205440;153838528 | |
146 | resnetv23_stage3_conv21_fwd | Convolution | [256,256,14,14] | 1415817.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 3285.33 | 5.80 | 0.00 | 0.00 | true | 0.058083;0.058097;0.058234;0.057919;0.058044 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;4736;2432;9088 | |
147 | resnetv23_stage3__plus6 | elemwise_add | [256,1024,14,14] | 13237.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 4561120.00 | 5148842.67 | 95.70 | 5.29 | 69.34 | true | 0.957216;0.957115;0.956945;0.956957;0.957224 | 51380224;51380224;51380224;51380224;51380224 | 5616128;5533824;2533408;6078496;532480 | 6200384;6108032;3138112;6682144;991616 | |
148 | resnetv23_stage3_batchnorm21_fwd | BatchNorm | [256,1024,14,14] | 25189 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 56088138.67 | 60462442.67 | 90.20 | 2.79 | 586.40 | true | 0.901539;0.901698;0.901857;0.901946;0.901733 | 325058560;325058560;325058560;325058560;325058560 | 58241888;61480128;51772640;58249888;51761056 | 62786400;66254688;55814976;62785952;55804704 | |
149 | resnetv23_stage3_activation21 | Activation | [256,1024,14,14] | 10798.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 543.00 | 102760448 | 61014336.00 | 62081568.00 | 98.70 | 0.83 | 189.25 | true | 0.987167;0.986958;0.987547;0.986816;0.987557 | 102760448;102760448;102760448;102760448;102760448 | 57802880;64225696;61014144;57803168;67437728 | 57800256;64227488;64221376;57800832;64222496 | |
150 | resnetv23_stage3_conv22_fwd | Convolution | [256,1024,14,14] | 1477803.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.33 | 26332364800 | 157442922.67 | 37787328.00 | 24.80 | 134.88 | 12561.16 | false | 0.247893;0.247842;0.248254;0.247397;0.247535 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 158592160;157166624;155384480;156569984;159714176 | 37808832;37773120;37785856;37694752;37803008 | |
150 | resnetv23_stage3_conv22_fwd | Convolution | [256,1024,14,14] | 1477803.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058070;0.058039;0.058578;0.057888;0.058598 | 0;0;0;0;0 | 96;96;5984;96;96 | 2432;2176;16512;2432;2432 | |
151 | resnetv23_stage3_batchnorm22_fwd | BatchNorm | [256,256,14,14] | 6345 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52053397.33 | 55742890.67 | 88.20 | 0.75 | 565.65 | true | 0.882271;0.881591;0.882427;0.881519;0.882775 | 81264640;81264640;81264640;81264640;81264640 | 52051744;52050944;52049120;52067328;52057504 | 55746016;55764224;55755136;55680448;55727520 | |
152 | resnetv23_stage3_activation22 | Activation | [256,256,14,14] | 2742.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51370037.33 | 95.70 | 0.25 | 188.44 | true | 0.956818;0.956428;0.956544;0.956513;0.957509 | 25690112;25690112;25690112;25690112;25690112 | 51362464;51368000;51369408;51374720;51372704 | 51380576;51380576;51380608;51380576;51380576 | |
153 | resnetv23_stage3_conv23_fwd | Convolution | [256,256,14,14] | 3393545.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.33 | 19365101568 | 84114869.33 | 77431221.33 | 24.70 | 119.87 | 12933.06 | false | 0.246860;0.247026;0.246710;0.247116;0.247072 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 89956992;84569440;78349856;84397504;83377664 | 78617920;77478176;77473056;77123648;77342432 | |
153 | resnetv23_stage3_conv23_fwd | Convolution | [256,256,14,14] | 3393545.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.00 | 251658240 | 51389088.00 | 150601888.00 | 47.10 | 1.25 | 911.81 | true | 0.471014;0.471230;0.472065;0.470672;0.470511 | 251658240;251658240;251658240;251658240;251658240 | 51388000;51385312;51393952;51363424;51403424 | 150612800;150565440;150589760;150603104;150625728 | |
153 | resnetv23_stage3_conv23_fwd | Convolution | [256,256,14,14] | 3393545.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 258.00 | 269484032 | 151377077.33 | 54370602.67 | 48.30 | 1.31 | 1044.51 | true | 0.482191;0.483067;0.482415;0.482668;0.483164 | 269484032;269484032;269484032;269484032;269484032 | 54426144;54398112;54238752;54347584;54366112 | 151385824;151397152;151362304;151360128;151383104 | |
153 | resnetv23_stage3_conv23_fwd | Convolution | [256,256,14,14] | 3393545.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9325717.33 | 30.60 | 0.45 | 312.26 | true | 0.302378;0.303988;0.309809;0.308753;0.305796 | 5308416;5308416;5308416;5308416;5308416 | 9342080;9327232;9319808;9324960;9324960 | 2359552;2359552;2361600;2359552;2359552 | |
154 | resnetv23_stage3_batchnorm23_fwd | BatchNorm | [256,256,14,14] | 6396.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51769141.33 | 55921696.00 | 88.20 | 0.75 | 568.28 | true | 0.879770;0.881587;0.881711;0.881687;0.881680 | 81264640;81264640;81264640;81264640;81264640 | 51774368;51753280;51766848;51779936;51766208 | 55894496;55947328;55966272;55899392;55918368 | |
155 | resnetv23_stage3_activation23 | Activation | [256,256,14,14] | 2761 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51373610.67 | 95.70 | 0.25 | 187.98 | true | 0.956973;0.957958;0.955745;0.955996;0.956565 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51374944;51369760;51375040;51374464;51371424 | |
156 | resnetv23_stage3_conv24_fwd | Convolution | [256,256,14,14] | 1422148.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.67 | 26409435136 | 156147690.67 | 152853802.67 | 24.90 | 85.47 | 12686.68 | false | 0.249015;0.249014;0.248890;0.248966;0.248991 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153858656;156083776;155575520;156783776;158647840 | 153692416;151494464;154161824;152161696;152707296 | |
156 | resnetv23_stage3_conv24_fwd | Convolution | [256,256,14,14] | 1422148.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 4949.33 | 5.80 | 0.00 | 0.00 | true | 0.058036;0.058073;0.058078;0.057897;0.058665 | 0;0;0;0;0 | 96;8544;96;96;96 | 2432;22400;9600;2560;2688 | |
157 | resnetv23_stage3__plus7 | elemwise_add | [256,1024,14,14] | 13147.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.33 | 51380224 | 138.67 | 533.33 | 95.70 | 76458.67 | 69.40 | false | 0.956592;0.956547;0.956591;0.956395;0.956636 | 51380224;51380224;51380224;51380224;51380224 | 160;128;128;1152;128 | 512;512;576;2816;448 | |
158 | resnetv23_stage3_batchnorm24_fwd | BatchNorm | [256,1024,14,14] | 25240.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 57155008.00 | 61622880.00 | 90.20 | 2.74 | 585.69 | true | 0.901996;0.901660;0.902076;0.902028;0.902008 | 325058560;325058560;325058560;325058560;325058560 | 48524096;80878144;58233792;61472608;51758624 | 55816160;90682944;62781184;66271296;55809952 | |
159 | resnetv23_stage3_activation24 | Activation | [256,1024,14,14] | 10861.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.00 | 102760448 | 76000330.67 | 74927370.67 | 98.70 | 0.68 | 190.30 | true | 0.987237;0.987027;0.986726;0.987553;0.986543 | 102760448;102760448;102760448;102760448;102760448 | 67436960;93127072;61014144;83493280;77070752 | 64221952;96339328;61009248;83492384;77067776 | |
160 | resnetv23_stage3_conv25_fwd | Convolution | [256,1024,14,14] | 1475154 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2098.00 | 26332364800 | 158976736.00 | 37809056.00 | 24.80 | 133.81 | 12551.17 | false | 0.246799;0.247533;0.247500;0.248261;0.248422 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 153247584;164212992;155158336;163680160;158091712 | 37825280;37827200;37852064;37774688;37752960 | |
160 | resnetv23_stage3_conv25_fwd | Convolution | [256,1024,14,14] | 1475154 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.058102;0.058085;0.058051;0.057926;0.058089 | 0;0;0;0;0 | 2560;2944;2432;2432;2432 | 96;96;96;96;96 | |
161 | resnetv23_stage3_batchnorm25_fwd | BatchNorm | [256,256,14,14] | 6287 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 52049717.33 | 55721546.67 | 88.20 | 0.75 | 564.34 | true | 0.882552;0.883033;0.881852;0.882078;0.882492 | 81264640;81264640;81264640;81264640;81264640 | 55780256;55771616;55673408;55719616;55669792 | 52030336;52045952;52050752;52053664;52052448 | |
162 | resnetv23_stage3_activation25 | Activation | [256,256,14,14] | 2781.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51372202.67 | 95.70 | 0.25 | 188.44 | true | 0.956079;0.957516;0.957520;0.956760;0.956993 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51366464;51369792;51373184;51373632;51381984 | |
163 | resnetv23_stage3_conv26_fwd | Convolution | [256,256,14,14] | 3395801 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 86682624.00 | 76840138.67 | 24.70 | 118.42 | 12930.18 | false | 0.246644;0.246601;0.246727;0.246670;0.247014 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 88731392;89909472;76573280;81407008;90303872 | 75610400;75823840;76961408;77735168;79112768 | |
163 | resnetv23_stage3_conv26_fwd | Convolution | [256,256,14,14] | 3395801 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.67 | 251658240 | 51384309.33 | 150608469.33 | 47.10 | 1.25 | 909.61 | true | 0.471332;0.470691;0.471672;0.471368;0.470562 | 251658240;251658240;251658240;251658240;251658240 | 150644288;150594272;150652224;150586848;150586720 | 51375584;51354784;51379680;51397664;51402272 | |
163 | resnetv23_stage3_conv26_fwd | Convolution | [256,256,14,14] | 3395801 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 259.33 | 269484032 | 151342005.33 | 54098784.00 | 48.20 | 1.31 | 1039.14 | true | 0.483055;0.482582;0.482161;0.482326;0.482289 | 269484032;269484032;269484032;269484032;269484032 | 151335808;151346336;151344768;151345440;151302688 | 54018080;54146208;54178496;53971968;54132064 | |
163 | resnetv23_stage3_conv26_fwd | Convolution | [256,256,14,14] | 3395801 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9309696.00 | 30.60 | 0.45 | 325.01 | true | 0.307687;0.314702;0.304258;0.301208;0.306480 | 5308416;5308416;5308416;5308416;5308416 | 9279232;9351808;9261312;9352704;9298048 | 2359552;2359552;2359552;2359552;2359552 | |
164 | resnetv23_stage3_batchnorm26_fwd | BatchNorm | [256,256,14,14] | 6267.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51774570.67 | 55883552.00 | 88.20 | 0.75 | 568.28 | true | 0.881633;0.882161;0.882305;0.881558;0.881923 | 81264640;81264640;81264640;81264640;81264640 | 55849024;55917856;55915424;55824000;55886208 | 51777984;51779072;51766656;51780768;51760192 | |
165 | resnetv23_stage3_activation26 | Activation | [256,256,14,14] | 2869 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51375818.67 | 95.70 | 0.25 | 187.98 | true | 0.957169;0.956868;0.956820;0.956807;0.957106 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380608 | 51377152;51378368;51372576;51377728;51372064 | |
166 | resnetv23_stage3_conv27_fwd | Convolution | [256,256,14,14] | 1424178 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2078.00 | 26409435136 | 154424416.00 | 151822154.67 | 24.90 | 86.24 | 12709.06 | false | 0.248926;0.249019;0.248922;0.248988;0.248980 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 155109056;154355808;154466368;154451072;154077248 | 151698176;151425888;152306496;151461792;155098656 | |
166 | resnetv23_stage3_conv27_fwd | Convolution | [256,256,14,14] | 1424178 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2560.00 | 5.80 | 0.00 | 0.00 | true | 0.058087;0.058083;0.058246;0.057972;0.058058 | 0;0;0;0;0 | 2688;2560;2432;2688;2432 | 96;96;96;96;96 | |
167 | resnetv23_stage3__plus8 | elemwise_add | [256,1024,14,14] | 13266.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 3766581.33 | 4368832.00 | 95.70 | 6.32 | 69.34 | true | 0.957173;0.956679;0.956918;0.956842;0.957079 | 51380224;51380224;51380224;51380224;51380224 | 6249376;3293312;3216704;128;4789728 | 6867168;3851072;3833408;544;5422016 | |
168 | resnetv23_stage3_batchnorm27_fwd | BatchNorm | [256,1024,14,14] | 25193 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 57163594.67 | 61619690.67 | 90.20 | 2.74 | 585.69 | true | 0.902117;0.901928;0.901599;0.902070;0.901844 | 325058560;325058560;325058560;325058560;325058560 | 51773728;61474080;51766592;58242976;67954048 | 55812640;66254112;55795840;62792320;76735136 | |
169 | resnetv23_stage3_activation27 | Activation | [256,1024,14,14] | 10818.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.67 | 102760448 | 64225770.67 | 64223925.33 | 98.70 | 0.80 | 189.71 | true | 0.987091;0.987978;0.987286;0.987079;0.987632 | 102760448;102760448;102760448;102760448;102760448 | 67436672;61014432;96338336;64226208;61014432 | 64222080;64225920;96333664;64223776;64220608 | |
170 | resnetv23_stage3_conv28_fwd | Convolution | [256,1024,14,14] | 1476209 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2093.00 | 26332364800 | 158691594.67 | 37603285.33 | 24.80 | 134.15 | 12581.16 | false | 0.248484;0.247811;0.247684;0.247736;0.247220 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37764384;37768928;37152448;37734432;37311040 | 164549216;157286880;161206880;155542848;157581024 | |
170 | resnetv23_stage3_conv28_fwd | Convolution | [256,1024,14,14] | 1476209 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.058400;0.058102;0.058075;0.057924;0.058641 | 0;0;0;0;0 | 96;96;96;96;96 | 2560;2432;2432;2432;2688 | |
171 | resnetv23_stage3_batchnorm28_fwd | BatchNorm | [256,256,14,14] | 6369.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52051082.67 | 55745461.33 | 88.30 | 0.75 | 566.96 | true | 0.881818;0.883334;0.883346;0.882440;0.881922 | 81264640;81264640;81264640;81264640;81264640 | 52048384;52056832;52050240;52028032;52054624 | 55764576;55751040;55720768;55712416;55765920 | |
172 | resnetv23_stage3_activation28 | Activation | [256,256,14,14] | 2757 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380608.00 | 51371392.00 | 95.70 | 0.25 | 188.44 | true | 0.958555;0.956479;0.957856;0.956493;0.956706 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380608;51380608;51380640 | 51373344;51373184;51371584;51369408;51368832 | |
173 | resnetv23_stage3_conv29_fwd | Convolution | [256,256,14,14] | 3392080.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.67 | 19365101568 | 80968885.33 | 77020010.67 | 24.70 | 122.57 | 12861.48 | false | 0.246619;0.246511;0.246905;0.246603;0.246770 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 86670944;80891648;77396896;83967456;78047552 | 75872256;77859392;75564192;78083264;77328384 | |
173 | resnetv23_stage3_conv29_fwd | Convolution | [256,256,14,14] | 3392080.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 277.00 | 251658240 | 51376266.67 | 150576800.00 | 47.10 | 1.25 | 908.51 | true | 0.472280;0.470688;0.471620;0.470726;0.470882 | 251658240;251658240;251658240;251658240;251658240 | 150563136;150612064;150587968;150532192;150579296 | 51397536;51323680;51387040;51365344;51376416 | |
173 | resnetv23_stage3_conv29_fwd | Convolution | [256,256,14,14] | 3392080.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 258.67 | 269484032 | 151384042.67 | 54400821.33 | 48.30 | 1.31 | 1041.82 | true | 0.483198;0.482707;0.482823;0.482770;0.482506 | 269484032;269484032;269484032;269484032;269484032 | 54424032;54448736;54332032;54353440;54424992 | 151377696;151358176;151392800;151388512;151385920 | |
173 | resnetv23_stage3_conv29_fwd | Convolution | [256,256,14,14] | 3392080.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9345674.67 | 30.50 | 0.45 | 312.26 | true | 0.304070;0.302947;0.301825;0.309311;0.307699 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9346176;9328384;9339648;9407232;9351200 | |
174 | resnetv23_stage3_batchnorm29_fwd | BatchNorm | [256,256,14,14] | 6395.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51773589.33 | 55937386.67 | 88.20 | 0.75 | 568.28 | true | 0.882124;0.881911;0.882389;0.881533;0.882464 | 81264640;81264640;81264640;81264640;81264640 | 51778976;51774240;51767200;51767552;51779392 | 55884256;55949216;55957632;55940160;55922784 | |
175 | resnetv23_stage3_activation29 | Activation | [256,256,14,14] | 2755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.00 | 25690112 | 51380576.00 | 51367125.33 | 95.70 | 0.25 | 187.52 | true | 0.955047;0.956889;0.955814;0.957068;0.957179 | 25690112;25690112;25690112;25690112;25690112 | 51356512;51368288;51370752;51362336;51371456 | 51380576;51380576;51380576;51380576;51380576 | |
176 | resnetv23_stage3_conv30_fwd | Convolution | [256,256,14,14] | 1420087.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2075.33 | 26409435136 | 156943530.67 | 152283637.33 | 24.90 | 85.40 | 12725.40 | false | 0.249011;0.248986;0.248979;0.248951;0.249034 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 156629280;158524064;154077472;160576448;155677248 | 151716384;153772096;151362432;153863232;151322144 | |
176 | resnetv23_stage3_conv30_fwd | Convolution | [256,256,14,14] | 1420087.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058049;0.058067;0.058287;0.057890;0.058126 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2688;2432;2432 | |
177 | resnetv23_stage3__plus9 | elemwise_add | [256,1024,14,14] | 13263.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 2850314.67 | 3443616.00 | 95.70 | 8.16 | 69.37 | true | 0.956694;0.956533;0.956871;0.956514;0.956657 | 51380224;51380224;51380224;51380224;51380224 | 6157440;1649056;6691296;1633248;2524352 | 5577728;1075712;6087392;1062176;1897504 | |
178 | resnetv23_stage3_batchnorm30_fwd | BatchNorm | [256,1024,14,14] | 26059.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 60444778.67 | 63967360.00 | 90.00 | 2.61 | 586.40 | true | 0.900375;0.900231;0.900268;0.900383;0.900181 | 325058560;325058560;325058560;325058560;325058560 | 61522144;64754080;58280544;61531648;55046496 | 66302144;69780608;62795424;62804512;55808448 | |
179 | resnetv23_stage3_activation30 | Activation | [256,1024,14,14] | 10791.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 539.67 | 102760448 | 62084661.33 | 62080576.00 | 98.70 | 0.83 | 190.41 | true | 0.987295;0.987300;0.986855;0.987076;0.987265 | 102760448;102760448;102760448;102760448;102760448 | 61014144;64225696;61014144;57809824;73859488 | 64218944;64222880;57798720;57799904;73858272 | |
180 | resnetv23_stage3_conv31_fwd | Convolution | [256,1024,14,14] | 1476154 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.33 | 26332364800 | 156305898.67 | 37776746.67 | 24.70 | 135.68 | 12561.16 | false | 0.247190;0.247284;0.247301;0.246888;0.247428 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 158808064;155600160;157092544;154176000;156224992 | 37732480;37761568;37817280;37811744;37756928 | |
180 | resnetv23_stage3_conv31_fwd | Convolution | [256,1024,14,14] | 1476154 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058114;0.058078;0.058073;0.057900;0.058109 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2432;2688;2176 | |
181 | resnetv23_stage3_batchnorm31_fwd | BatchNorm | [256,256,14,14] | 6287 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 52048693.33 | 55704266.67 | 88.30 | 0.75 | 564.34 | true | 0.882924;0.882711;0.882087;0.882549;0.882798 | 81264640;81264640;81264640;81264640;81264640 | 52051936;52056288;52062944;52027168;52037856 | 55669280;55707744;55649440;55789088;55735776 | |
182 | resnetv23_stage3_activation31 | Activation | [256,256,14,14] | 2753.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51369440.00 | 95.70 | 0.25 | 188.44 | true | 0.957090;0.956325;0.956030;0.956960;0.958526 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51376160;51367712;51367712;51372896;51366624 | |
183 | resnetv23_stage3_conv32_fwd | Convolution | [256,256,14,14] | 3393470 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.00 | 19365101568 | 84198709.33 | 77814133.33 | 24.70 | 119.53 | 12927.30 | false | 0.246420;0.247013;0.246454;0.246924;0.246718 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 80034944;86856416;83170560;82569152;91495840 | 77545440;78078912;77352160;77818048;79123296 | |
183 | resnetv23_stage3_conv32_fwd | Convolution | [256,256,14,14] | 3393470 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.33 | 251658240 | 51384928.00 | 150619850.67 | 47.10 | 1.25 | 910.71 | true | 0.471251;0.470036;0.471335;0.471008;0.470743 | 251658240;251658240;251658240;251658240;251658240 | 51372512;51406560;51375392;51385184;51394208 | 150605920;150653760;150594624;150599872;150702176 | |
183 | resnetv23_stage3_conv32_fwd | Convolution | [256,256,14,14] | 3393470 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.00 | 269484032 | 151332042.67 | 54172618.67 | 48.30 | 1.31 | 1036.48 | true | 0.482955;0.483147;0.481930;0.482737;0.482672 | 269484032;269484032;269484032;269484032;269484032 | 151364832;151311136;151317760;151328768;151349600 | 54178624;54154080;54185152;54027712;54253920 | |
183 | resnetv23_stage3_conv32_fwd | Convolution | [256,256,14,14] | 3393470 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9332064.00 | 30.20 | 0.45 | 318.50 | true | 0.309417;0.303501;0.301836;0.301810;0.300967 | 5308416;5308416;5308416;5308416;5308416 | 2364672;2359552;2359552;2359552;2359552 | 9366016;9307040;9355648;9333504;9238560 | |
184 | resnetv23_stage3_batchnorm32_fwd | BatchNorm | [256,256,14,14] | 6429.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51766432.00 | 55900170.67 | 88.20 | 0.75 | 568.28 | true | 0.882016;0.882377;0.882327;0.880897;0.882381 | 81264640;81264640;81264640;81264640;81264640 | 51773088;51770016;51756192;51754592;51788512 | 55903712;55874336;55892160;55904640;55915072 | |
185 | resnetv23_stage3_activation32 | Activation | [256,256,14,14] | 2763.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51372800.00 | 95.80 | 0.25 | 188.44 | true | 0.957369;0.956937;0.958004;0.957232;0.958124 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380576;51380576;51380576 | 51402688;51371584;51370304;51374880;51371936 | |
186 | resnetv23_stage3_conv33_fwd | Convolution | [256,256,14,14] | 1414221 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2083.33 | 26409435136 | 154814410.67 | 152762069.33 | 24.90 | 85.86 | 12676.53 | false | 0.248971;0.248933;0.248549;0.248918;0.248779 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 152561952;151151552;154146080;151578176;154639744 | 157838816;155229536;151394560;155474464;153739232 | |
186 | resnetv23_stage3_conv33_fwd | Convolution | [256,256,14,14] | 1414221 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058089;0.058521;0.058061;0.057924;0.058217 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;2432;2432 | |
187 | resnetv23_stage3__plus10 | elemwise_add | [256,1024,14,14] | 13244.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 2420629.33 | 2913653.33 | 95.70 | 9.63 | 69.34 | true | 0.957073;0.956962;0.956872;0.956846;0.957514 | 51380224;51380224;51380224;51380224;51380224 | 6081088;128;2283008;4718112;260768 | 6679008;640;2862336;5298464;580160 | |
188 | resnetv23_stage3_batchnorm33_fwd | BatchNorm | [256,1024,14,14] | 25196.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 55052309.33 | 59313290.67 | 90.00 | 2.84 | 586.40 | true | 0.899846;0.900542;0.900063;0.899803;0.900265 | 325058560;325058560;325058560;325058560;325058560 | 58285856;55059456;51812896;58284576;51809440 | 62804576;59315584;55819488;62802048;55822240 | |
189 | resnetv23_stage3_activation33 | Activation | [256,1024,14,14] | 10761 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.00 | 102760448 | 69577877.33 | 69575157.33 | 98.70 | 0.74 | 190.30 | true | 0.987545;0.987645;0.987485;0.987174;0.987443 | 102760448;102760448;102760448;102760448;102760448 | 54591616;77070752;67436672;64226208;80282016 | 51376384;77071584;67430912;64222976;80276672 | |
190 | resnetv23_stage3_conv34_fwd | Convolution | [256,1024,14,14] | 1475214 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.00 | 26332364800 | 156484928.00 | 37783125.33 | 24.80 | 135.55 | 12569.15 | false | 0.247604;0.247790;0.247874;0.247937;0.247134 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 157647520;164158848;156630720;155176544;154872384 | 37777056;37708352;37822944;37785792;37786528 | |
190 | resnetv23_stage3_conv34_fwd | Convolution | [256,1024,14,14] | 1475214 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058094;0.058106;0.058087;0.058324;0.058094 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2688;2432;2432 | |
191 | resnetv23_stage3_batchnorm34_fwd | BatchNorm | [256,256,14,14] | 6352 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52061013.33 | 55750880.00 | 88.30 | 0.75 | 565.65 | true | 0.882064;0.882962;0.882726;0.882954;0.883090 | 81264640;81264640;81264640;81264640;81264640 | 52067456;52062592;52074432;52051872;52052992 | 55753888;55743648;55672384;55755104;55779488 | |
192 | resnetv23_stage3_activation34 | Activation | [256,256,14,14] | 2765 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51365717.33 | 95.70 | 0.25 | 188.44 | true | 0.957542;0.955336;0.957175;0.957269;0.957403 | 25690112;25690112;25690112;25690112;25690112 | 51365280;51358688;51370848;51371520;51361024 | 51380576;51380576;51380832;51380576;51380576 | |
193 | resnetv23_stage3_conv35_fwd | Convolution | [256,256,14,14] | 3397056 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.00 | 19365101568 | 86756960.00 | 77224693.33 | 24.70 | 118.09 | 12927.30 | false | 0.247089;0.246857;0.246976;0.246775;0.246681 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 87301856;84558336;80205952;89270144;88410688 | 77556768;76868416;77248896;77621504;72641536 | |
193 | resnetv23_stage3_conv35_fwd | Convolution | [256,256,14,14] | 3397056 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.33 | 251658240 | 51380576.00 | 150593034.67 | 47.10 | 1.25 | 910.71 | true | 0.471247;0.471203;0.471471;0.470448;0.471205 | 251658240;251658240;251658240;251658240;251658240 | 150603360;150591712;150560448;150618848;150584032 | 51381920;51354016;51420384;51372384;51387424 | |
193 | resnetv23_stage3_conv35_fwd | Convolution | [256,256,14,14] | 3397056 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 258.33 | 269484032 | 151381557.33 | 54352949.33 | 48.30 | 1.31 | 1043.17 | true | 0.482800;0.483395;0.482282;0.483135;0.482491 | 269484032;269484032;269484032;269484032;269484032 | 151373088;151396352;151384256;151375360;151385056 | 54298560;54425120;54373952;54272384;54386336 | |
193 | resnetv23_stage3_conv35_fwd | Convolution | [256,256,14,14] | 3397056 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9364032.00 | 30.60 | 0.45 | 325.01 | true | 0.306842;0.304657;0.305842;0.301988;0.308450 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9349920;9344416;9398016;9349280;9392896 | |
194 | resnetv23_stage3_batchnorm35_fwd | BatchNorm | [256,256,14,14] | 6395.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 51770186.67 | 55947584.00 | 88.20 | 0.75 | 566.96 | true | 0.883095;0.882779;0.882483;0.881884;0.881591 | 81264640;81264640;81264640;81264640;81264640 | 51762880;51773952;51768128;51780000;51768480 | 55931104;55964512;55900224;55955584;55956064 | |
195 | resnetv23_stage3_activation35 | Activation | [256,256,14,14] | 2728.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.00 | 25690112 | 51380576.00 | 51373792.00 | 95.70 | 0.25 | 187.52 | true | 0.957628;0.957073;0.957587;0.957256;0.957130 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51390560;51380576 | 51379840;51366688;51371424;51379968;51370112 | |
196 | resnetv23_stage3_conv36_fwd | Convolution | [256,256,14,14] | 1422437.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2080.67 | 26409435136 | 155639477.33 | 152449290.67 | 24.90 | 85.72 | 12692.77 | false | 0.248968;0.248984;0.249013;0.248979;0.248903 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154081952;156769088;153797152;156067392;157061664 | 149972832;154994784;150795168;154200576;152352128 | |
196 | resnetv23_stage3_conv36_fwd | Convolution | [256,256,14,14] | 1422437.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.058048;0.058065;0.058387;0.057920;0.058089 | 0;0;0;0;0 | 96;96;96;96;96 | 2560;2432;2432;2432;2688 | |
197 | resnetv23_stage3__plus11 | elemwise_add | [256,1024,14,14] | 13178.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.33 | 51380224 | 4290880.00 | 4867317.33 | 95.70 | 5.61 | 69.40 | true | 0.956856;0.956654;0.956686;0.956654;0.956522 | 51380224;51380224;51380224;51380224;51380224 | 1503488;5186688;2312000;5384160;5373952 | 2094496;5759520;2892064;5972736;5950368 | |
198 | resnetv23_stage3_batchnorm36_fwd | BatchNorm | [256,1024,14,14] | 25346.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 58288213.33 | 61628576.00 | 90.00 | 2.71 | 585.69 | true | 0.900516;0.900117;0.899923;0.900223;0.900316 | 325058560;325058560;325058560;325058560;325058560 | 51808512;61529664;61523072;64757504;51811904 | 55810304;62801024;66274400;69796640;55808960 | |
199 | resnetv23_stage3_activation36 | Activation | [256,1024,14,14] | 11059 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 544.67 | 102760448 | 71720778.67 | 71713077.33 | 98.70 | 0.72 | 188.67 | true | 0.987385;0.987401;0.987587;0.987138;0.987270 | 102760448;102760448;102760448;102760448;102760448 | 57799008;70646400;89911840;70639872;73852960 | 57802880;73865888;93127072;67436960;73859488 | |
200 | resnetv23_stage3_conv37_fwd | Convolution | [256,1024,14,14] | 1477710.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2093.67 | 26332364800 | 158103424.00 | 37817578.67 | 24.80 | 134.40 | 12577.15 | false | 0.247600;0.247440;0.247465;0.248098;0.248850 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 157995520;156699744;161671648;156054272;159615008 | 37851328;37794624;37823968;37766240;37834144 | |
200 | resnetv23_stage3_conv37_fwd | Convolution | [256,1024,14,14] | 1477710.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058112;0.058089;0.058061;0.057908;0.058080 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;2432;2432 | |
201 | resnetv23_stage3_batchnorm37_fwd | BatchNorm | [256,256,14,14] | 6296 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52034528.00 | 55750944.00 | 88.30 | 0.75 | 565.65 | true | 0.882737;0.882953;0.882490;0.883023;0.883143 | 81264640;81264640;81264640;81264640;81264640 | 55782432;55736640;55751488;55699968;55764704 | 52028416;52057152;52025760;52049408;52023264 | |
202 | resnetv23_stage3_activation37 | Activation | [256,256,14,14] | 2760.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380586.67 | 51371893.33 | 95.80 | 0.25 | 187.98 | true | 0.957486;0.954774;0.957639;0.957569;0.957507 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380832;51380576;51380608;51380576 | 51369792;51373984;51379072;51371904;51369728 | |
203 | resnetv23_stage3_conv38_fwd | Convolution | [256,256,14,14] | 3395867.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 84873920.00 | 76274154.67 | 24.70 | 120.17 | 12930.18 | false | 0.246453;0.246830;0.247059;0.246904;0.246787 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 88083680;85222656;86243328;83155776;82804992 | 77797408;77496480;74025120;77300864;71516544 | |
203 | resnetv23_stage3_conv38_fwd | Convolution | [256,256,14,14] | 3395867.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.33 | 251658240 | 51373642.67 | 150575285.33 | 47.10 | 1.25 | 910.71 | true | 0.470972;0.472148;0.471389;0.471753;0.471289 | 251658240;251658240;251658240;251658240;251658240 | 51388320;51377824;51381664;51361440;51357152 | 150555488;150686816;150550336;150574048;150596320 | |
203 | resnetv23_stage3_conv38_fwd | Convolution | [256,256,14,14] | 3395867.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.00 | 269484032 | 151342517.33 | 54188298.67 | 48.30 | 1.31 | 1036.48 | true | 0.482723;0.482614;0.482852;0.482344;0.482217 | 269484032;269484032;269484032;269484032;269484032 | 151369120;151336480;151340960;151340480;151346112 | 54240256;54235776;54194624;54134496;54122944 | |
203 | resnetv23_stage3_conv38_fwd | Convolution | [256,256,14,14] | 3395867.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9335552.00 | 30.80 | 0.45 | 312.26 | true | 0.308024;0.316593;0.312428;0.298685;0.304190 | 5308416;5308416;5308416;5308416;5308416 | 9385248;9191808;9358208;9344384;9304064 | 2359552;2359552;2364928;2359552;2359552 | |
204 | resnetv23_stage3_batchnorm38_fwd | BatchNorm | [256,256,14,14] | 6414.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51771178.67 | 55894453.33 | 88.20 | 0.75 | 568.28 | true | 0.882451;0.882145;0.881789;0.881605;0.882216 | 81264640;81264640;81264640;81264640;81264640 | 51779328;51756128;51752480;51783808;51778080 | 55893920;55895680;55924640;55893760;55847264 | |
205 | resnetv23_stage3_activation38 | Activation | [256,256,14,14] | 2747.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380597.33 | 51369845.33 | 95.80 | 0.25 | 188.44 | true | 0.959327;0.958202;0.958505;0.958141;0.957445 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380832;51380576;51380608 | 51365344;51374976;51364160;51374272;51369920 | |
206 | resnetv23_stage3_conv39_fwd | Convolution | [256,256,14,14] | 1416696.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.33 | 26409435136 | 155882261.33 | 151920618.67 | 24.90 | 85.80 | 12688.71 | false | 0.248941;0.249042;0.248988;0.248668;0.248967 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 155202880;154333824;156868160;155575744;158657696 | 151267392;151523712;152375584;151862560;152676864 | |
206 | resnetv23_stage3_conv39_fwd | Convolution | [256,256,14,14] | 1416696.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 3840.00 | 5.80 | 0.00 | 0.00 | true | 0.058070;0.058130;0.058061;0.057932;0.058048 | 0;0;0;0;0 | 96;96;96;608;96 | 2688;2432;12928;3200;5632 | |
207 | resnetv23_stage3__plus12 | elemwise_add | [256,1024,14,14] | 13273 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 3815754.67 | 4394176.00 | 95.70 | 6.26 | 69.34 | true | 0.957109;0.956846;0.956763;0.956896;0.957072 | 51380224;51380224;51380224;51380224;51380224 | 6604576;2800224;5292160;5090144;4288 | 6009504;2224416;4711008;4511840;128 | |
208 | resnetv23_stage3_batchnorm39_fwd | BatchNorm | [256,1024,14,14] | 25141.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 59367690.67 | 63961173.33 | 90.00 | 2.64 | 586.40 | true | 0.900520;0.900057;0.899924;0.899811;0.900240 | 325058560;325058560;325058560;325058560;325058560 | 62796576;83729696;62805696;62798528;66279296 | 58288480;80964768;55061280;58294528;61520064 | |
209 | resnetv23_stage3_activation39 | Activation | [256,1024,14,14] | 10794.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.33 | 102760448 | 67436970.67 | 66363637.33 | 98.70 | 0.77 | 189.83 | true | 0.987533;0.986750;0.986818;0.987397;0.987423 | 102760448;102760448;102760448;102760448;102760448 | 64225408;73859488;83496864;64225696;64225728 | 64219808;70647840;83492352;64223168;64219904 | |
210 | resnetv23_stage3_conv40_fwd | Convolution | [256,1024,14,14] | 1478071 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.33 | 26332364800 | 160822762.67 | 37800138.67 | 24.80 | 132.57 | 12567.15 | false | 0.247927;0.248160;0.247519;0.247571;0.247404 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 158688064;164623904;157039328;166310944;159156320 | 37764192;37807424;37830016;37778944;37814048 | |
210 | resnetv23_stage3_conv40_fwd | Convolution | [256,1024,14,14] | 1478071 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058061;0.058080;0.058075;0.057922;0.058087 | 0;0;0;0;0 | 96;96;96;2912;96 | 2432;2432;2432;11648;2688 | |
211 | resnetv23_stage3_batchnorm40_fwd | BatchNorm | [256,256,14,14] | 6368 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 52044010.67 | 55780362.67 | 88.30 | 0.75 | 568.28 | true | 0.882873;0.883191;0.882796;0.882272;0.883201 | 81264640;81264640;81264640;81264640;81264640 | 55769312;55729088;55811680;55818752;55760096 | 52043136;52044864;52044032;52031552;52051200 | |
212 | resnetv23_stage3_activation40 | Activation | [256,256,14,14] | 2744.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51369194.67 | 95.70 | 0.25 | 188.44 | true | 0.957947;0.954933;0.957803;0.956631;0.955569 | 25690112;25690112;25690112;25690112;25690112 | 51373696;51362112;51369952;51369248;51368384 | 51380576;51380576;51380576;51380576;51380576 | |
213 | resnetv23_stage3_conv41_fwd | Convolution | [256,256,14,14] | 3393810.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.33 | 19365101568 | 84302709.33 | 76297024.00 | 24.70 | 120.58 | 12864.33 | false | 0.246996;0.246470;0.246756;0.246646;0.246732 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 82455072;80770496;87962464;85677440;84775616 | 75623968;75989792;77930752;75589568;77277312 | |
213 | resnetv23_stage3_conv41_fwd | Convolution | [256,256,14,14] | 3393810.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 277.33 | 251658240 | 51372469.33 | 150616736.00 | 47.10 | 1.25 | 907.42 | true | 0.470976;0.470840;0.471124;0.470467;0.471595 | 251658240;251658240;251658240;251658240;251658240 | 51405024;51363744;51360800;51388448;51365216 | 150628672;150588224;150563392;150633312;150656320 | |
213 | resnetv23_stage3_conv41_fwd | Convolution | [256,256,14,14] | 3393810.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 258.00 | 269484032 | 151377546.67 | 54325344.00 | 48.20 | 1.31 | 1044.51 | true | 0.482466;0.482222;0.482538;0.482396;0.482966 | 269484032;269484032;269484032;269484032;269484032 | 151366944;151378528;151370592;151383520;151400192 | 54345440;54358208;54195552;54272384;54431776 | |
213 | resnetv23_stage3_conv41_fwd | Convolution | [256,256,14,14] | 3393810.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9329418.67 | 30.90 | 0.45 | 312.26 | true | 0.303481;0.303410;0.312888;0.318175;0.310213 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2366208;2359552 | 9311744;9370400;9364352;9312160;9270016 | |
214 | resnetv23_stage3_batchnorm41_fwd | BatchNorm | [256,256,14,14] | 6310.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51760384.00 | 55934784.00 | 88.20 | 0.75 | 568.28 | true | 0.881605;0.882321;0.883027;0.882069;0.881719 | 81264640;81264640;81264640;81264640;81264640 | 51743264;51763040;51758688;51759424;51768096 | 55877056;55951072;55943392;55936416;55924544 | |
215 | resnetv23_stage3_activation41 | Activation | [256,256,14,14] | 2754 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51372693.33 | 95.80 | 0.25 | 187.98 | true | 0.956238;0.958653;0.957762;0.958232;0.958144 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51379872;51369888;51374592;51370784;51372704 | |
216 | resnetv23_stage3_conv42_fwd | Convolution | [256,256,14,14] | 1424257.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2078.33 | 26409435136 | 155794656.00 | 152001674.67 | 24.90 | 85.80 | 12707.03 | false | 0.249042;0.249037;0.249072;0.248913;0.248957 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 152867008;152322720;153622208;150131104;150815296 | 161829216;158362592;153849600;153652064;155171776 | |
216 | resnetv23_stage3_conv42_fwd | Convolution | [256,256,14,14] | 1424257.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058381;0.058060;0.058140;0.057899;0.058090 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;7552;2688;2432;2432 | |
217 | resnetv23_stage3__plus13 | elemwise_add | [256,1024,14,14] | 13140.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.00 | 51380224 | 2208501.33 | 2804864.00 | 95.70 | 10.25 | 69.43 | true | 0.956384;0.956897;0.957077;0.956869;0.956285 | 51380224;51380224;51380224;51380224;51380224 | 1173248;2108544;3055424;3401344;1461536 | 1763808;2672192;3652192;4015232;2090208 | |
218 | resnetv23_stage3_batchnorm42_fwd | BatchNorm | [256,1024,14,14] | 25050.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 59368160.00 | 62808608.00 | 90.10 | 2.66 | 585.69 | true | 0.900730;0.900054;0.900574;0.900706;0.900566 | 325058560;325058560;325058560;325058560;325058560 | 58293120;61524896;58293920;61517440;51807552 | 62827616;62800704;62797504;69788960;55816768 | |
219 | resnetv23_stage3_activation42 | Activation | [256,1024,14,14] | 10790.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 539.33 | 102760448 | 65297824.00 | 64220778.67 | 98.70 | 0.79 | 190.53 | true | 0.987473;0.987077;0.987058;0.987397;0.987104 | 102760448;102760448;102760448;102760448;102760448 | 64221792;64219968;70645312;64219968;64220576 | 61014144;64230816;73859488;64225696;67436960 | |
220 | resnetv23_stage3_conv43_fwd | Convolution | [256,1024,14,14] | 1475851 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.00 | 26332364800 | 158962293.33 | 37768821.33 | 24.70 | 133.85 | 12569.15 | false | 0.247298;0.247182;0.247411;0.248159;0.247478 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 155110464;159719104;159918720;159738912;157428864 | 37728640;37653280;37847328;37801696;37776128 | |
220 | resnetv23_stage3_conv43_fwd | Convolution | [256,1024,14,14] | 1475851 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058082;0.058090;0.058048;0.057924;0.058082 | 0;0;0;0;0 | 2688;2432;3712;2688;2432 | 96;96;96;96;96 | |
221 | resnetv23_stage3_batchnorm43_fwd | BatchNorm | [256,256,14,14] | 6265.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52042432.00 | 55780277.33 | 88.20 | 0.75 | 565.65 | true | 0.882501;0.882218;0.883021;0.882493;0.882001 | 81264640;81264640;81264640;81264640;81264640 | 52061632;52043648;52042528;52041120;52037664 | 55626592;55791392;55768032;55784064;55788736 | |
222 | resnetv23_stage3_activation43 | Activation | [256,256,14,14] | 2751.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51374261.33 | 95.80 | 0.25 | 187.98 | true | 0.956813;0.958299;0.958170;0.957555;0.958224 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380608;51380576 | 51378528;51377888;51373824;51367232;51371072 | |
223 | resnetv23_stage3_conv44_fwd | Convolution | [256,256,14,14] | 3395466.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.33 | 19365101568 | 86974357.33 | 77144448.00 | 24.70 | 117.99 | 12864.33 | false | 0.246616;0.247110;0.246907;0.246649;0.246887 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 88195712;87151904;85575456;85333760;93035968 | 77607104;76397760;77428480;75151616;78511968 | |
223 | resnetv23_stage3_conv44_fwd | Convolution | [256,256,14,14] | 3395466.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.67 | 251658240 | 51377397.33 | 150584053.33 | 47.10 | 1.25 | 909.61 | true | 0.471056;0.471411;0.470507;0.471495;0.471154 | 251658240;251658240;251658240;251658240;251658240 | 51371744;51390176;51358240;51391520;51370272 | 150568768;150545760;150576448;150606944;150758752 | |
223 | resnetv23_stage3_conv44_fwd | Convolution | [256,256,14,14] | 3395466.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.00 | 269484032 | 151347125.33 | 54206752.00 | 48.20 | 1.31 | 1036.48 | true | 0.482185;0.482378;0.482046;0.482084;0.482544 | 269484032;269484032;269484032;269484032;269484032 | 151338368;151358976;151342848;151351328;151347200 | 54229184;54277600;54186752;54204320;54111968 | |
223 | resnetv23_stage3_conv44_fwd | Convolution | [256,256,14,14] | 3395466.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.67 | 5308416 | 2359552.00 | 9339232.00 | 30.90 | 0.45 | 318.50 | true | 0.310649;0.306060;0.306902;0.308403;0.312631 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2361600;2359552;2359552 | 9358080;9384864;9340704;9318912;9168512 | |
224 | resnetv23_stage3_batchnorm44_fwd | BatchNorm | [256,256,14,14] | 6340 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51776533.33 | 55882048.00 | 88.20 | 0.75 | 568.28 | true | 0.882516;0.881548;0.882247;0.881633;0.882964 | 81264640;81264640;81264640;81264640;81264640 | 55881504;55893312;55892608;55838944;55872032 | 51777696;51774272;51762560;51789728;51777632 | |
225 | resnetv23_stage3_activation44 | Activation | [256,256,14,14] | 2755.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.00 | 25690112 | 51380586.67 | 51378122.67 | 95.70 | 0.25 | 187.52 | true | 0.957496;0.956648;0.955411;0.958307;0.958024 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380608;51380576;51385952 | 51363392;51396832;51372320;51365216;51408800 | |
226 | resnetv23_stage3_conv45_fwd | Convolution | [256,256,14,14] | 1415671.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2083.00 | 26409435136 | 156834506.67 | 152590858.67 | 24.90 | 85.35 | 12678.56 | false | 0.248823;0.249009;0.249029;0.248997;0.248922 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153906784;159778976;155879872;155593376;159030272 | 151870016;153731328;150737216;152171232;155101312 | |
226 | resnetv23_stage3_conv45_fwd | Convolution | [256,256,14,14] | 1415671.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.056427;0.058078;0.058046;0.057933;0.058070 | 0;0;0;0;0 | 96;96;2656;96;96 | 2432;2688;11136;2432;2432 | |
227 | resnetv23_stage3__plus14 | elemwise_add | [256,1024,14,14] | 13446.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 3530506.67 | 4092512.00 | 95.70 | 6.74 | 69.34 | true | 0.956573;0.956857;0.956970;0.957469;0.956997 | 51380224;51380224;51380224;51380224;51380224 | 2947584;128;3128256;4515680;4766592 | 3538080;896;3657952;5081504;5349344 | |
228 | resnetv23_stage3_batchnorm45_fwd | BatchNorm | [256,1024,14,14] | 25887 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 57211914.67 | 61636213.33 | 90.00 | 2.74 | 585.69 | true | 0.900039;0.899940;0.900150;0.900394;0.900791 | 325058560;325058560;325058560;325058560;325058560 | 55813216;69771808;66289056;62799264;55820320 | 48571584;64767040;61525184;58291680;51818880 | |
229 | resnetv23_stage3_activation45 | Activation | [256,1024,14,14] | 10787.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.00 | 102760448 | 67436842.67 | 68504714.67 | 98.70 | 0.76 | 189.95 | true | 0.986925;0.986915;0.987160;0.987198;0.987106 | 102760448;102760448;102760448;102760448;102760448 | 73859200;64225632;67436672;61014432;70648224 | 73855008;64222016;70644640;64221952;70647488 | |
230 | resnetv23_stage3_conv46_fwd | Convolution | [256,1024,14,14] | 1474843 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2092.67 | 26332364800 | 157870293.33 | 37760288.00 | 24.80 | 134.60 | 12583.16 | false | 0.247049;0.247520;0.247939;0.247416;0.247873 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 164839808;157153632;157757344;157621824;158231712 | 37772512;37714080;37794272;37815488;37667488 | |
230 | resnetv23_stage3_conv46_fwd | Convolution | [256,1024,14,14] | 1474843 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2442.67 | 5.80 | 0.00 | 0.00 | true | 0.058077;0.058051;0.058104;0.057897;0.058060 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2464;2688;2432;2176 | |
231 | resnetv23_stage3_batchnorm46_fwd | BatchNorm | [256,256,14,14] | 6303.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52038517.33 | 55761984.00 | 88.30 | 0.75 | 566.96 | true | 0.884045;0.882085;0.882653;0.882502;0.883549 | 81264640;81264640;81264640;81264640;81264640 | 55755072;55769632;55730048;55799200;55761248 | 52053344;52043488;52030656;52024608;52041408 | |
232 | resnetv23_stage3_activation46 | Activation | [256,256,14,14] | 2765.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51368032.00 | 95.70 | 0.25 | 188.44 | true | 0.957581;0.956852;0.956544;0.956411;0.957669 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51373920;51368672;51366720;51368704;51360736 | |
233 | resnetv23_stage3_conv47_fwd | Convolution | [256,256,14,14] | 3396495.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1511.67 | 19365101568 | 83297322.67 | 77297674.67 | 24.70 | 120.58 | 12810.43 | false | 0.247175;0.246713;0.247056;0.246484;0.247160 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 78815136;77935040;87932320;84493824;86583008 | 75560384;77564192;77787232;77457536;76871296 | |
233 | resnetv23_stage3_conv47_fwd | Convolution | [256,256,14,14] | 3396495.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 277.00 | 251658240 | 51386805.33 | 150607104.00 | 47.10 | 1.25 | 908.51 | true | 0.471654;0.470867;0.471241;0.471400;0.471206 | 251658240;251658240;251658240;251658240;251658240 | 150566240;150576864;150623552;150626016;150620896 | 51394336;51383264;51400672;51382816;51352672 | |
233 | resnetv23_stage3_conv47_fwd | Convolution | [256,256,14,14] | 3396495.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 258.33 | 269484032 | 151366933.33 | 54309738.67 | 48.20 | 1.31 | 1043.17 | true | 0.482186;0.482450;0.482543;0.482201;0.482565 | 269484032;269484032;269484032;269484032;269484032 | 54085472;54326080;54242848;54360288;54392608 | 151357792;151309472;151366848;151387456;151376160 | |
233 | resnetv23_stage3_conv47_fwd | Convolution | [256,256,14,14] | 3396495.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9314325.33 | 30.60 | 0.45 | 325.01 | true | 0.313890;0.304631;0.301153;0.302449;0.310743 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2362624 | 9391264;9329408;9308704;9304864;9298976 | |
234 | resnetv23_stage3_batchnorm47_fwd | BatchNorm | [256,256,14,14] | 6539.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 51765664.00 | 55956042.67 | 88.20 | 0.75 | 565.65 | true | 0.881747;0.881733;0.881905;0.882274;0.882388 | 81264640;81264640;81264640;81264640;81264640 | 51768192;51767008;51766720;51763264;51753728 | 55910592;55942752;55962208;55966336;55963168 | |
235 | resnetv23_stage3_activation47 | Activation | [256,256,14,14] | 2810.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380586.67 | 51372117.33 | 95.80 | 0.25 | 188.44 | true | 0.957090;0.957779;0.957782;0.957967;0.956047 | 25690112;25690112;25690112;25690112;25690112 | 51387232;51380576;51380576;51380608;51380576 | 51388160;51374016;51361536;51372448;51369888 | |
236 | resnetv23_stage3_conv48_fwd | Convolution | [256,256,14,14] | 1421380.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2079.33 | 26409435136 | 156367989.33 | 151433738.67 | 24.90 | 85.80 | 12700.92 | false | 0.248928;0.249010;0.248988;0.248915;0.248876 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154980160;154624416;156779104;157344704;157702720 | 150709152;150523456;151755520;151836544;152400960 | |
236 | resnetv23_stage3_conv48_fwd | Convolution | [256,256,14,14] | 1421380.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 437.33 | 4565.33 | 5.80 | 0.00 | 0.00 | true | 0.058043;0.058058;0.058068;0.057886;0.058061 | 0;0;0;0;0 | 96;2144;96;96;1120 | 8320;6784;2432;2432;4480 | |
237 | resnetv23_stage3__plus15 | elemwise_add | [256,1024,14,14] | 13207.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.33 | 51380224 | 2537322.67 | 3093973.33 | 95.70 | 9.12 | 69.40 | true | 0.956613;0.956807;0.957032;0.956477;0.956585 | 51380224;51380224;51380224;51380224;51380224 | 4053504;4931680;465120;128;3093344 | 4673280;5506464;947872;416;3660768 | |
238 | resnetv23_stage3_batchnorm48_fwd | BatchNorm | [256,1024,14,14] | 25021.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 70157226.67 | 74423680.00 | 90.00 | 2.25 | 585.69 | true | 0.900317;0.900346;0.900532;0.900379;0.900457 | 325058560;325058560;325058560;325058560;325058560 | 80961696;61522464;67987520;84184000;61508448 | 87217312;66288096;69765632;90746272;62798720 | |
239 | resnetv23_stage3_activation48 | Activation | [256,1024,14,14] | 10761.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 539.67 | 102760448 | 77070741.33 | 75995381.33 | 98.70 | 0.67 | 190.41 | true | 0.987857;0.986752;0.987389;0.986962;0.987391 | 102760448;102760448;102760448;102760448;102760448 | 54591616;83493216;80282016;83493280;67436992 | 54590432;83486560;77067424;83485600;67433120 | |
240 | resnetv23_stage3_conv49_fwd | Convolution | [256,1024,14,14] | 1476897.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2094.67 | 26332364800 | 156500181.33 | 37802570.67 | 24.70 | 135.52 | 12571.15 | false | 0.248446;0.247255;0.247381;0.247299;0.247301 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 154067136;159927168;154610496;155052768;159837280 | 37817024;37805632;37804480;37797600;37793120 | |
240 | resnetv23_stage3_conv49_fwd | Convolution | [256,1024,14,14] | 1476897.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058449;0.058090;0.058061;0.057894;0.058090 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;2432;2432 | |
241 | resnetv23_stage3_batchnorm49_fwd | BatchNorm | [256,256,14,14] | 6280 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52034069.33 | 55765877.33 | 88.30 | 0.75 | 565.65 | true | 0.883026;0.883014;0.882513;0.882582;0.882283 | 81264640;81264640;81264640;81264640;81264640 | 52004544;52032544;52038368;52037632;52032032 | 55828576;55780320;55759136;55758176;55741920 | |
242 | resnetv23_stage3_activation49 | Activation | [256,256,14,14] | 2726.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380608.00 | 51374069.33 | 95.80 | 0.25 | 188.90 | true | 0.957696;0.957158;0.957251;0.958065;0.957690 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51385696;51380640;51380576 | 51373440;51375904;51381856;51366368;51372864 | |
243 | resnetv23_stage3_conv50_fwd | Convolution | [256,256,14,14] | 3398330.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 83977962.67 | 77023114.67 | 24.70 | 120.28 | 12930.18 | false | 0.246401;0.247012;0.246893;0.246845;0.246850 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 85019520;86684768;82269760;80838464;84644608 | 74265568;77685824;77689504;77631328;75752192 | |
243 | resnetv23_stage3_conv50_fwd | Convolution | [256,256,14,14] | 3398330.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.67 | 251658240 | 51390410.67 | 150608341.33 | 47.10 | 1.25 | 909.61 | true | 0.470661;0.470978;0.470729;0.470010;0.470789 | 251658240;251658240;251658240;251658240;251658240 | 51393248;51387872;51397664;51390112;51366432 | 150582304;150643936;150593216;150587872;150650848 | |
243 | resnetv23_stage3_conv50_fwd | Convolution | [256,256,14,14] | 3398330.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.67 | 269484032 | 151342058.67 | 54164533.33 | 48.20 | 1.31 | 1033.82 | true | 0.481811;0.482297;0.481523;0.482240;0.481652 | 269484032;269484032;269484032;269484032;269484032 | 151351808;151316352;151340672;151353920;151333696 | 54263456;54124736;54155520;54213344;54073120 | |
243 | resnetv23_stage3_conv50_fwd | Convolution | [256,256,14,14] | 3398330.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.33 | 5308416 | 2359552.00 | 9302400.00 | 30.50 | 0.46 | 325.01 | true | 0.310607;0.306357;0.300501;0.301486;0.305735 | 5308416;5308416;5308416;5308416;5308416 | 2360064;2359552;2359552;2359552;2359552 | 9364992;9259520;9293952;9353728;9252128 | |
244 | resnetv23_stage3_batchnorm50_fwd | BatchNorm | [256,256,14,14] | 6350.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51774165.33 | 55879136.00 | 88.20 | 0.75 | 568.28 | true | 0.881717;0.882113;0.882090;0.881873;0.881776 | 81264640;81264640;81264640;81264640;81264640 | 51796992;51794464;51750944;51766240;51761792 | 55883264;55869728;55869056;55884416;55932192 | |
245 | resnetv23_stage3_activation50 | Activation | [256,256,14,14] | 2736 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380586.67 | 51372832.00 | 95.70 | 0.25 | 187.98 | true | 0.956583;0.958995;0.956166;0.954821;0.956970 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380608;51380576;51380576 | 51371392;51374336;51374464;51370816;51372768 | |
246 | resnetv23_stage3_conv51_fwd | Convolution | [256,256,14,14] | 1415951 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2076.33 | 26409435136 | 157541258.67 | 153043776.00 | 24.90 | 85.03 | 12719.27 | false | 0.248922;0.249007;0.248961;0.249015;0.248907 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 160640832;155836480;164593120;152657856;156146464 | 153995040;153355040;155399104;149736992;151781248 | |
246 | resnetv23_stage3_conv51_fwd | Convolution | [256,256,14,14] | 1415951 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058087;0.058087;0.058060;0.057902;0.058048 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2432;2688;2432 | |
247 | resnetv23_stage3__plus16 | elemwise_add | [256,1024,14,14] | 13235.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 2470282.67 | 3050954.67 | 95.70 | 9.31 | 69.34 | true | 0.956795;0.956979;0.957190;0.957066;0.956639 | 51380224;51380224;51380224;51380224;51380224 | 128;2281984;1597312;3531552;4750560 | 928;2883168;2168384;4101312;5286272 | |
248 | resnetv23_stage3_batchnorm51_fwd | BatchNorm | [256,1024,14,14] | 25558.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 554.33 | 325058560 | 64770741.33 | 70954677.33 | 90.00 | 2.39 | 586.40 | true | 0.900277;0.900244;0.900322;0.900075;0.900170 | 325058560;325058560;325058560;325058560;325058560 | 68000032;61548992;48572928;207268288;64763200 | 76766144;66306752;55815264;223269664;69791136 | |
249 | resnetv23_stage3_activation51 | Activation | [256,1024,14,14] | 10891.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 538.67 | 102760448 | 76000202.67 | 74921973.33 | 98.70 | 0.68 | 190.77 | true | 0.987524;0.986982;0.987257;0.987467;0.987643 | 102760448;102760448;102760448;102760448;102760448 | 83493216;70648160;73859232;89915744;67436960 | 83484960;70639456;70641504;89907712;64222432 | |
250 | resnetv23_stage3_conv52_fwd | Convolution | [256,1024,14,14] | 1476858 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 158086634.67 | 37707509.33 | 24.80 | 134.49 | 12565.15 | false | 0.247712;0.247669;0.246865;0.247642;0.247515 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 37236864;37708736;37662912;37822464;37750880 | 168735648;156720224;160767040;156772640;154025440 | |
250 | resnetv23_stage3_conv52_fwd | Convolution | [256,1024,14,14] | 1476858 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058048;0.058041;0.058070;0.057899;0.058102 | 0;0;0;0;0 | 2432;2432;2432;2432;2688 | 96;96;96;96;96 | |
251 | resnetv23_stage3_batchnorm52_fwd | BatchNorm | [256,256,14,14] | 6348.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52039264.00 | 55783829.33 | 88.30 | 0.75 | 566.96 | true | 0.883170;0.882756;0.882906;0.882437;0.882311 | 81264640;81264640;81264640;81264640;81264640 | 52014880;52064256;52040832;52038720;52038240 | 55843488;55694016;55748544;55782528;55820416 | |
252 | resnetv23_stage3_activation52 | Activation | [256,256,14,14] | 2742.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 135.67 | 25690112 | 51380576.00 | 51371104.00 | 95.80 | 0.25 | 189.36 | true | 0.955173;0.958098;0.957697;0.958172;0.957593 | 25690112;25690112;25690112;25690112;25690112 | 51379232;51370112;51365280;51377920;51363360 | 51380576;51380576;51380576;51382624;51380576 | |
253 | resnetv23_stage3_conv53_fwd | Convolution | [256,256,14,14] | 3396475.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.00 | 19365101568 | 84308416.00 | 77241600.00 | 24.70 | 119.87 | 12927.30 | false | 0.246849;0.247336;0.246504;0.246971;0.246753 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 82888032;92431232;87365600;81060032;82671616 | 77204832;75375264;77879840;76640128;78847136 | |
253 | resnetv23_stage3_conv53_fwd | Convolution | [256,256,14,14] | 3396475.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.33 | 251658240 | 51367200.00 | 150586432.00 | 47.10 | 1.25 | 910.71 | true | 0.471137;0.471648;0.470861;0.471198;0.470706 | 251658240;251658240;251658240;251658240;251658240 | 51421216;51370592;51349152;51365152;51365856 | 150573760;150596544;150525664;150588992;150645728 | |
253 | resnetv23_stage3_conv53_fwd | Convolution | [256,256,14,14] | 3396475.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 259.33 | 269484032 | 151369610.67 | 54348618.67 | 48.20 | 1.31 | 1039.14 | true | 0.482546;0.482377;0.482632;0.482290;0.482406 | 269484032;269484032;269484032;269484032;269484032 | 151402336;151381632;151338176;151356160;151371040 | 54387744;54378112;54280000;54246240;54392960 | |
253 | resnetv23_stage3_conv53_fwd | Convolution | [256,256,14,14] | 3396475.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9358976.00 | 30.40 | 0.45 | 331.78 | true | 0.312249;0.299254;0.305588;0.300809;0.306541 | 5308416;5308416;5308416;5308416;5308416 | 9367296;9341440;9396384;9368192;9285248 | 2365440;2359552;2359552;2359552;2359552 | |
254 | resnetv23_stage3_batchnorm53_fwd | BatchNorm | [256,256,14,14] | 6370.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51761365.33 | 55924202.67 | 88.20 | 0.75 | 568.28 | true | 0.882094;0.882218;0.882485;0.882279;0.882299 | 81264640;81264640;81264640;81264640;81264640 | 51745984;51771616;51764064;51750784;51769248 | 55909120;55953728;55915360;55948128;55904736 | |
255 | resnetv23_stage3_activation53 | Activation | [256,256,14,14] | 2804.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380586.67 | 51370634.67 | 95.70 | 0.25 | 188.44 | true | 0.958317;0.957633;0.957142;0.956294;0.956711 | 25690112;25690112;25690112;25690112;25690112 | 51372864;51366080;51372352;51367136;51372416 | 51382624;51380608;51380576;51380576;51380576 | |
256 | resnetv23_stage3_conv54_fwd | Convolution | [256,256,14,14] | 1421100.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2079.00 | 26409435136 | 156404693.33 | 153529216.00 | 24.90 | 85.21 | 12702.95 | false | 0.248825;0.249044;0.249017;0.248617;0.248895 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153187264;157156224;156740672;155878400;156595008 | 150890688;154960224;154320576;152215168;154051904 | |
256 | resnetv23_stage3_conv54_fwd | Convolution | [256,256,14,14] | 1421100.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2474.67 | 5.80 | 0.00 | 0.00 | true | 0.058061;0.058036;0.058106;0.057889;0.058104 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2560;2688;2432;2432 | |
257 | resnetv23_stage3__plus17 | elemwise_add | [256,1024,14,14] | 13162.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 957280.00 | 1178506.67 | 95.70 | 24.06 | 69.37 | false | 0.956628;0.956688;0.956368;0.957003;0.956661 | 51380224;51380224;51380224;51380224;51380224 | 128;2871584;5755712;128;128 | 800;3534304;6335488;416;384 | |
258 | resnetv23_stage3_batchnorm54_fwd | BatchNorm | [256,1024,14,14] | 25072.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 62600192.00 | 66300522.67 | 90.00 | 2.52 | 585.69 | true | 0.900414;0.900101;0.900334;0.900320;0.900341 | 325058560;325058560;325058560;325058560;325058560 | 55047552;64761088;61519744;74470496;61519744 | 55841920;69791168;62814400;80253376;66296000 | |
259 | resnetv23_stage3_activation54 | Activation | [256,1024,14,14] | 10762.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.67 | 102760448 | 79211552.00 | 78134613.33 | 98.70 | 0.65 | 190.06 | true | 0.987559;0.986871;0.987185;0.987052;0.987030 | 102760448;102760448;102760448;102760448;102760448 | 86704480;86704480;67436960;80282016;70648160 | 83486272;83483488;64219392;80278400;70641952 | |
260 | resnetv23_stage3_conv55_fwd | Convolution | [256,1024,14,14] | 1475707 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.33 | 26332364800 | 156248554.67 | 37781994.67 | 24.80 | 135.71 | 12561.16 | false | 0.247464;0.247502;0.248490;0.247997;0.247687 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 156127744;161826688;155468512;157149408;153179744 | 37804800;37834144;37745280;37789760;37751424 | |
260 | resnetv23_stage3_conv55_fwd | Convolution | [256,1024,14,14] | 1475707 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2602.67 | 5.80 | 0.00 | 0.00 | true | 0.058152;0.058082;0.058099;0.057902;0.058072 | 0;0;0;0;0 | 96;96;352;96;96 | 2688;2432;5760;2688;2432 | |
261 | resnetv23_stage3_batchnorm55_fwd | BatchNorm | [256,256,14,14] | 6311.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52037354.67 | 55747776.00 | 88.30 | 0.75 | 565.65 | true | 0.882141;0.883032;0.882404;0.883235;0.882762 | 81264640;81264640;81264640;81264640;81264640 | 52052384;52024064;52035808;52046720;52029536 | 55749888;55771168;55722272;55717024;55799712 | |
262 | resnetv23_stage3_activation55 | Activation | [256,256,14,14] | 2750.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.00 | 25690112 | 51380586.67 | 51375445.33 | 95.70 | 0.25 | 187.52 | true | 0.956034;0.956584;0.957782;0.955918;0.957201 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380608;51380576;51383936;51380576 | 51378016;51374208;51367712;51376256;51375872 | |
263 | resnetv23_stage3_conv56_fwd | Convolution | [256,256,14,14] | 3397113 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1498.67 | 19365101568 | 82060629.33 | 77425930.67 | 24.70 | 121.42 | 12921.55 | false | 0.246461;0.246722;0.246923;0.247171;0.247126 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 80052192;84185248;81944448;85206016;78788672 | 77100800;75687008;77973696;77687328;77489664 | |
263 | resnetv23_stage3_conv56_fwd | Convolution | [256,256,14,14] | 3397113 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.67 | 251658240 | 51400693.33 | 150601386.67 | 47.10 | 1.25 | 909.61 | true | 0.471623;0.471172;0.470310;0.471111;0.471292 | 251658240;251658240;251658240;251658240;251658240 | 51376544;51406560;51405152;51405856;51391072 | 150605664;150688352;150595680;150602816;150588512 | |
263 | resnetv23_stage3_conv56_fwd | Convolution | [256,256,14,14] | 3397113 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.00 | 269484032 | 151333472.00 | 54209728.00 | 48.20 | 1.31 | 1036.48 | true | 0.482064;0.481878;0.482243;0.482619;0.482935 | 269484032;269484032;269484032;269484032;269484032 | 151365664;151311936;151350336;151338144;151273024 | 54197152;54250432;54231168;54061344;54200864 | |
263 | resnetv23_stage3_conv56_fwd | Convolution | [256,256,14,14] | 3397113 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9302240.00 | 30.80 | 0.46 | 312.26 | true | 0.314690;0.305102;0.309574;0.309374;0.300616 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9303328;9258624;9328384;9275008;9338368 | |
264 | resnetv23_stage3_batchnorm56_fwd | BatchNorm | [256,256,14,14] | 6340.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51774336.00 | 55901376.00 | 88.20 | 0.75 | 568.28 | true | 0.881364;0.881919;0.881711;0.882310;0.882126 | 81264640;81264640;81264640;81264640;81264640 | 51765312;51784384;51772928;51781696;51768384 | 55913280;55900416;55890432;55919456;55874720 | |
265 | resnetv23_stage3_activation56 | Activation | [256,256,14,14] | 2754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380576.00 | 51371456.00 | 95.80 | 0.25 | 187.98 | true | 0.958024;0.957821;0.957209;0.956637;0.958616 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51382368;51380576 | 51381824;51369856;51369344;51364832;51375168 | |
266 | resnetv23_stage3_conv57_fwd | Convolution | [256,256,14,14] | 1418310.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2079.33 | 26409435136 | 154008330.67 | 150993664.00 | 24.90 | 86.59 | 12700.92 | false | 0.249006;0.249008;0.248980;0.248916;0.248891 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 155425600;153970240;153820032;152268736;154234720 | 151496736;151583776;150656192;149820992;150828064 | |
266 | resnetv23_stage3_conv57_fwd | Convolution | [256,256,14,14] | 1418310.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058065;0.058082;0.058070;0.057912;0.058060 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;2432;2432;2432 | |
267 | resnetv23_stage3__plus18 | elemwise_add | [256,1024,14,14] | 13251.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 3064682.67 | 3650954.67 | 95.70 | 7.65 | 69.34 | true | 0.957266;0.957297;0.956931;0.957075;0.956971 | 51380224;51380224;51380224;51380224;51380224 | 128;1340544;3717344;5629344;4136160 | 512;1933728;4284448;6189824;4734688 | |
268 | resnetv23_stage3_batchnorm57_fwd | BatchNorm | [256,1024,14,14] | 25608 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 58288256.00 | 63957792.00 | 90.00 | 2.66 | 585.69 | true | 0.900039;0.900132;0.900322;0.900343;0.900511 | 325058560;325058560;325058560;325058560;325058560 | 55055968;58278944;84193280;55042752;61529856 | 55824896;62784640;90696704;62806400;66282336 | |
269 | resnetv23_stage3_activation57 | Activation | [256,1024,14,14] | 10786.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.67 | 102760448 | 85634090.67 | 84557994.67 | 98.70 | 0.60 | 189.71 | true | 0.987359;0.987275;0.987778;0.986634;0.987554 | 102760448;102760448;102760448;102760448;102760448 | 93127040;77070688;86704544;105972064;57803168 | 89911456;77065856;86696672;109179552;57802432 | |
270 | resnetv23_stage3_conv58_fwd | Convolution | [256,1024,14,14] | 1477276 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.33 | 26332364800 | 161254645.33 | 37825397.33 | 24.70 | 132.27 | 12567.15 | false | 0.247446;0.247607;0.247020;0.247165;0.248159 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 162602528;164235968;162856064;158305344;156499360 | 37764320;37899648;37820256;37806816;37849120 | |
270 | resnetv23_stage3_conv58_fwd | Convolution | [256,1024,14,14] | 1477276 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.058046;0.058032;0.058102;0.057893;0.058333 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;7552;2688;2432;2432 | |
271 | resnetv23_stage3_batchnorm58_fwd | BatchNorm | [256,256,14,14] | 6365.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.67 | 81264640 | 52041930.67 | 55776181.33 | 88.30 | 0.75 | 565.65 | true | 0.882700;0.882400;0.882790;0.882809;0.882678 | 81264640;81264640;81264640;81264640;81264640 | 52047488;52021056;52038048;52051456;52040256 | 55692640;55792576;55750496;55786016;55792032 | |
272 | resnetv23_stage3_activation58 | Activation | [256,256,14,14] | 2771.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51373226.67 | 95.80 | 0.25 | 188.44 | true | 0.958407;0.957835;0.956808;0.958440;0.956959 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380576;51380576;51380576;51380576 | 51381920;51372704;51385056;51359424;51365056 | |
273 | resnetv23_stage3_conv59_fwd | Convolution | [256,256,14,14] | 3442448.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1499.00 | 19365101568 | 84567018.67 | 77287914.67 | 24.70 | 119.64 | 12918.68 | false | 0.247164;0.246913;0.246988;0.246862;0.246605 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 76236064;77675872;77343584;77180864;77339296 | 84980896;84555808;88813152;82192416;84164352 | |
273 | resnetv23_stage3_conv59_fwd | Convolution | [256,256,14,14] | 3442448.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 277.33 | 251658240 | 51381216.00 | 150607189.33 | 47.10 | 1.25 | 907.42 | true | 0.470403;0.470900;0.470797;0.470902;0.471484 | 251658240;251658240;251658240;251658240;251658240 | 150566240;150623808;150614368;150583392;150667072 | 51396064;51398624;51377440;51370144;51361696 | |
273 | resnetv23_stage3_conv59_fwd | Convolution | [256,256,14,14] | 3442448.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 258.33 | 269484032 | 151369632.00 | 54404149.33 | 48.30 | 1.31 | 1043.17 | true | 0.483067;0.482732;0.482021;0.482167;0.482884 | 269484032;269484032;269484032;269484032;269484032 | 151383712;151371200;151399680;151348512;151353984 | 54450272;54409024;54439264;54364160;54360192 | |
273 | resnetv23_stage3_conv59_fwd | Convolution | [256,256,14,14] | 3442448.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9297717.33 | 30.50 | 0.46 | 312.26 | true | 0.312531;0.306644;0.303926;0.303179;0.300884 | 5308416;5308416;5308416;5308416;5308416 | 2359808;2359552;2359552;2359552;2359552 | 9328640;9268000;9296512;9358592;9258368 | |
274 | resnetv23_stage3_batchnorm59_fwd | BatchNorm | [256,256,14,14] | 6360.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51776106.67 | 55943733.33 | 88.20 | 0.75 | 568.28 | true | 0.882478;0.882328;0.882408;0.882305;0.881833 | 81264640;81264640;81264640;81264640;81264640 | 51767200;51790208;51787840;51767584;51772896 | 55935360;55887872;55954592;55941248;55969984 | |
275 | resnetv23_stage3_activation59 | Activation | [256,256,14,14] | 2749.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.67 | 25690112 | 51380597.33 | 51366965.33 | 95.70 | 0.25 | 187.98 | true | 0.957471;0.957491;0.956284;0.957731;0.957422 | 25690112;25690112;25690112;25690112;25690112 | 51380608;51380608;51380608;51380576;51380576 | 51365248;51363584;51369952;51373184;51365696 | |
276 | resnetv23_stage3_conv60_fwd | Convolution | [256,256,14,14] | 1425612.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.00 | 26409435136 | 155278997.33 | 153862026.67 | 24.90 | 85.43 | 12690.74 | false | 0.248969;0.248977;0.249062;0.248977;0.248943 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 156202368;160751456;152179648;151576320;157454976 | 153374784;153842368;151002176;154368928;154602208 | |
276 | resnetv23_stage3_conv60_fwd | Convolution | [256,256,14,14] | 1425612.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.058036;0.058058;0.058094;0.057931;0.058089 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2432;2432;2688 | |
277 | resnetv23_stage3__plus19 | elemwise_add | [256,1024,14,14] | 13203.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 1597632.00 | 2003680.00 | 95.70 | 14.27 | 69.37 | true | 0.956649;0.956806;0.956484;0.956766;0.956427 | 51380224;51380224;51380224;51380224;51380224 | 128;128;3939520;2929536;1863232 | 512;1664;4582528;3528064;2481312 | |
278 | resnetv23_stage3_batchnorm60_fwd | BatchNorm | [256,1024,14,14] | 25201.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 64754464.00 | 72112512.00 | 90.00 | 2.37 | 585.69 | true | 0.900674;0.900157;0.900284;0.900557;0.900504 | 325058560;325058560;325058560;325058560;325058560 | 61510304;71230336;61522752;80953664;51802592 | 69765472;76773472;69798592;83732000;55824288 | |
279 | resnetv23_stage3_activation60 | Activation | [256,1024,14,14] | 10841.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.00 | 102760448 | 67436917.33 | 67430890.67 | 98.80 | 0.76 | 190.30 | true | 0.987896;0.987505;0.987056;0.987463;0.987949 | 102760448;102760448;102760448;102760448;102760448 | 73859424;64225632;64225696;61014432;73859424 | 73850464;64220544;64221664;57800672;73855008 | |
280 | resnetv23_stage3_conv61_fwd | Convolution | [256,1024,14,14] | 1476459.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2095.67 | 26332364800 | 158098666.67 | 37807914.67 | 24.80 | 134.41 | 12565.15 | false | 0.247648;0.248123;0.247667;0.247296;0.248549 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 161002976;157096608;155345216;156196416;161824896 | 37818880;37741280;37816832;37801472;37805440 | |
280 | resnetv23_stage3_conv61_fwd | Convolution | [256,1024,14,14] | 1476459.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057897;0.058098;0.058111;0.057921;0.058137 | 0;0;0;0;0 | 2432;2688;2432;2432;2432 | 96;96;96;96;96 | |
281 | resnetv23_stage3_batchnorm61_fwd | BatchNorm | [256,256,14,14] | 6364 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52039338.67 | 55734901.33 | 88.30 | 0.75 | 566.96 | true | 0.882834;0.883095;0.882617;0.882428;0.882841 | 81264640;81264640;81264640;81264640;81264640 | 52027744;52041728;52058080;52038400;52037888 | 55774400;55726624;55733504;55744576;55714720 | |
282 | resnetv23_stage3_activation61 | Activation | [256,256,14,14] | 2770 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.33 | 25690112 | 51380576.00 | 51374560.00 | 95.70 | 0.25 | 188.44 | true | 0.954921;0.958420;0.958863;0.956397;0.957322 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380608;51380576 | 51377568;51364224;51374976;51376096;51372608 | |
283 | resnetv23_stage3_conv62_fwd | Convolution | [256,256,14,14] | 3398904 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.67 | 19365101568 | 85908512.00 | 77575690.67 | 24.70 | 118.45 | 12930.18 | false | 0.246207;0.246717;0.246683;0.246932;0.247059 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 77103296;77858400;78096000;77765376;70830272 | 82787904;89760448;78941408;88564384;86373248 | |
283 | resnetv23_stage3_conv62_fwd | Convolution | [256,256,14,14] | 3398904 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.67 | 251658240 | 51377269.33 | 150575349.33 | 47.10 | 1.25 | 909.61 | true | 0.471056;0.470760;0.471050;0.470353;0.470673 | 251658240;251658240;251658240;251658240;251658240 | 51372384;51395936;51369632;51382240;51377184 | 150641888;150568032;150574016;150584000;150553952 | |
283 | resnetv23_stage3_conv62_fwd | Convolution | [256,256,14,14] | 3398904 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.00 | 269484032 | 151328341.33 | 54199818.67 | 48.20 | 1.31 | 1036.48 | true | 0.482488;0.482670;0.482566;0.482406;0.482400 | 269484032;269484032;269484032;269484032;269484032 | 151319488;151354112;151334560;151326432;151324032 | 54164992;54238784;54215904;54218560;54096352 | |
283 | resnetv23_stage3_conv62_fwd | Convolution | [256,256,14,14] | 3398904 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9358645.33 | 30.30 | 0.45 | 312.26 | true | 0.313044;0.299799;0.302541;0.304837;0.302944 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2372608;2359552 | 9305888;9373824;9344288;9381632;9357824 | |
284 | resnetv23_stage3_batchnorm62_fwd | BatchNorm | [256,256,14,14] | 6416.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51779733.33 | 55858666.67 | 88.20 | 0.75 | 568.28 | true | 0.882408;0.882604;0.881899;0.882512;0.882092 | 81264640;81264640;81264640;81264640;81264640 | 55809536;55854464;55897984;55876576;55844960 | 51778496;51776608;51784096;51784608;51775296 | |
285 | resnetv23_stage3_activation62 | Activation | [256,256,14,14] | 2766.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.00 | 25690112 | 51380576.00 | 51378133.33 | 95.70 | 0.25 | 187.52 | true | 0.956436;0.957828;0.956036;0.957389;0.956622 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51385184 | 51375264;51379456;51375072;51379680;51384576 | |
286 | resnetv23_stage3_conv63_fwd | Convolution | [256,256,14,14] | 1417013.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2081.67 | 26409435136 | 156320533.33 | 152893589.33 | 24.90 | 85.41 | 12686.68 | false | 0.248805;0.249077;0.248970;0.248980;0.248959 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 154917504;156789760;154221952;161980128;157254336 | 151130336;152736928;153792064;154777600;152151776 | |
286 | resnetv23_stage3_conv63_fwd | Convolution | [256,256,14,14] | 1417013.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057924;0.058080;0.058243;0.057937;0.058056 | 0;0;0;0;0 | 96;96;96;96;96 | 2688;2432;2432;8320;2176 | |
287 | resnetv23_stage3__plus20 | elemwise_add | [256,1024,14,14] | 13197.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 3656864.00 | 4222794.67 | 95.70 | 6.52 | 69.34 | true | 0.957233;0.957085;0.956926;0.956941;0.956947 | 51380224;51380224;51380224;51380224;51380224 | 182656;4841120;5029792;1099680;5391040 | 411936;5383968;5613888;1670528;5954464 | |
288 | resnetv23_stage3_batchnorm63_fwd | BatchNorm | [256,1024,14,14] | 25289.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 59372778.67 | 63971573.33 | 90.00 | 2.64 | 585.69 | true | 0.900378;0.900023;0.900300;0.900525;0.900329 | 325058560;325058560;325058560;325058560;325058560 | 51809216;61538336;58288960;58291040;61538432 | 55819360;69787008;62806464;62805344;66302912 | |
289 | resnetv23_stage3_activation63 | Activation | [256,1024,14,14] | 10759.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.33 | 102760448 | 77070709.33 | 78136352.00 | 98.70 | 0.66 | 190.18 | true | 0.986356;0.986882;0.987230;0.987429;0.987813 | 102760448;102760448;102760448;102760448;102760448 | 70648160;93127008;99550880;67436960;64225632 | 70644640;93122688;102772032;70641728;64218528 | |
290 | resnetv23_stage3_conv64_fwd | Convolution | [256,1024,14,14] | 1476879.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2093.67 | 26332364800 | 157668650.67 | 37802293.33 | 24.80 | 134.71 | 12577.15 | false | 0.247724;0.248297;0.247468;0.247839;0.247671 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 202378848;156765408;153219616;154921344;161319200 | 40308064;37795392;37790592;37808352;37803136 | |
290 | resnetv23_stage3_conv64_fwd | Convolution | [256,1024,14,14] | 1476879.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057896;0.058080;0.058105;0.057924;0.058100 | 0;0;0;0;0 | 96;96;96;96;96 | 13952;2432;2432;2432;2688 | |
291 | resnetv23_stage3_batchnorm64_fwd | BatchNorm | [256,256,14,14] | 6592.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 144.00 | 81264640 | 52050005.33 | 55731136.00 | 88.30 | 0.75 | 564.34 | true | 0.883386;0.882342;0.881239;0.883031;0.882498 | 81264640;81264640;81264640;81264640;81264640 | 52058144;52027520;52051040;52054688;52044288 | 55726656;55826592;55704864;55749344;55717408 | |
292 | resnetv23_stage3_activation64 | Activation | [256,256,14,14] | 2773.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.00 | 25690112 | 51380576.00 | 51369450.67 | 95.80 | 0.25 | 187.52 | true | 0.959162;0.956315;0.957435;0.957515;0.957643 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380576 | 51373024;51372384;51365504;51368640;51367328 | |
293 | resnetv23_stage3_conv65_fwd | Convolution | [256,256,14,14] | 3395783.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1497.33 | 19365101568 | 82581632.00 | 77789269.33 | 24.70 | 120.75 | 12933.06 | false | 0.246618;0.246772;0.247070;0.246645;0.246975 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 91006176;80254048;74403584;82927008;84563840 | 78263168;77455904;78038752;77873152;75813344 | |
293 | resnetv23_stage3_conv65_fwd | Convolution | [256,256,14,14] | 3395783.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 276.67 | 251658240 | 51381344.00 | 150620330.67 | 47.20 | 1.25 | 909.61 | true | 0.471802;0.472013;0.471632;0.470288;0.471198 | 251658240;251658240;251658240;251658240;251658240 | 150585792;150616160;150621280;150693472;150623552 | 51394528;51375840;51376032;51383776;51384224 | |
293 | resnetv23_stage3_conv65_fwd | Convolution | [256,256,14,14] | 3395783.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 258.33 | 269484032 | 151364202.67 | 54293162.67 | 48.30 | 1.31 | 1043.17 | true | 0.481959;0.483576;0.482552;0.482966;0.482613 | 269484032;269484032;269484032;269484032;269484032 | 151357984;151380800;151319584;151369056;151365568 | 54309376;54299776;54425504;54176480;54270336 | |
293 | resnetv23_stage3_conv65_fwd | Convolution | [256,256,14,14] | 3395783.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 16.00 | 5308416 | 2359552.00 | 9328266.67 | 30.70 | 0.45 | 331.78 | true | 0.317582;0.306580;0.306454;0.307516;0.305127 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9349248;9324960;9329280;9268768;9330560 | |
294 | resnetv23_stage3_batchnorm65_fwd | BatchNorm | [256,256,14,14] | 6463 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 51767381.33 | 55947285.33 | 88.30 | 0.75 | 566.96 | true | 0.882313;0.882484;0.882821;0.882518;0.883129 | 81264640;81264640;81264640;81264640;81264640 | 51762496;51770016;51769728;51763680;51768736 | 55965120;55953024;55937664;55907232;55951168 | |
295 | resnetv23_stage3_activation65 | Activation | [256,256,14,14] | 2804.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51365642.67 | 95.70 | 0.25 | 188.90 | true | 0.956852;0.956410;0.953830;0.957659;0.957623 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380608;51380576 | 51363296;51355712;51374560;51374528;51359104 | |
296 | resnetv23_stage3_conv66_fwd | Convolution | [256,256,14,14] | 1424297.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2079.67 | 26409435136 | 157403402.67 | 153292170.67 | 24.90 | 85.00 | 12698.88 | false | 0.248974;0.249086;0.249006;0.248976;0.248928 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 156825568;157724064;157660576;160559552;154679264 | 154762048;153663424;152477312;153735776;151180704 | |
296 | resnetv23_stage3_conv66_fwd | Convolution | [256,256,14,14] | 1424297.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 2432.00 | 5.80 | 0.00 | 0.00 | true | 0.057940;0.058075;0.058109;0.057920;0.058087 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2432;2688;2432;2432 | |
297 | resnetv23_stage3__plus21 | elemwise_add | [256,1024,14,14] | 13183.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 740.67 | 51380224 | 2089418.67 | 2473002.67 | 95.70 | 11.26 | 69.37 | true | 0.956599;0.956620;0.956247;0.956544;0.956568 | 51380224;51380224;51380224;51380224;51380224 | 4782208;128;128;3985760;2282368 | 5406080;384;352;4560288;2858336 | |
298 | resnetv23_stage3_batchnorm66_fwd | BatchNorm | [256,1024,14,14] | 25343.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 57210314.67 | 59326624.00 | 90.00 | 2.79 | 585.69 | true | 0.900355;0.900008;0.900324;0.900509;0.900250 | 325058560;325058560;325058560;325058560;325058560 | 62805120;59335488;80251072;55839264;55825408 | 61526848;55047296;74479616;55056800;51807040 | |
299 | resnetv23_stage3_activation66 | Activation | [256,1024,14,14] | 10780.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 541.33 | 102760448 | 71718602.67 | 70644448.00 | 98.70 | 0.72 | 189.83 | true | 0.987293;0.986987;0.987335;0.986747;0.986956 | 102760448;102760448;102760448;102760448;102760448 | 73859424;73859424;67436960;67436960;73859680 | 70643776;73854752;70645312;64225152;70644256 | |
300 | resnetv23_stage3_conv67_fwd | Convolution | [256,1024,14,14] | 1477787 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2096.33 | 26332364800 | 160525237.33 | 37687744.00 | 24.80 | 132.85 | 12561.16 | false | 0.247858;0.247180;0.248242;0.247360;0.247988 | 26332364800;26332364800;26332364800;26332364800;26332364800 | 165930208;159152032;157075008;160959104;161464576 | 37600384;37744064;37601504;37833728;37717664 | |
300 | resnetv23_stage3_conv67_fwd | Convolution | [256,1024,14,14] | 1477787 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057870;0.058077;0.058061;0.057962;0.058566 | 0;0;0;0;0 | 6016;2432;2432;2688;2432 | 96;96;96;96;96 | |
301 | resnetv23_stage3_batchnorm67_fwd | BatchNorm | [256,256,14,14] | 6385 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.33 | 81264640 | 52039456.00 | 55765621.33 | 88.20 | 0.75 | 566.96 | true | 0.881880;0.882049;0.882906;0.882807;0.882503 | 81264640;81264640;81264640;81264640;81264640 | 52027520;52039712;52055552;52025344;52051136 | 55812096;55758976;55719584;55806144;55731744 | |
302 | resnetv23_stage3_activation67 | Activation | [256,256,14,14] | 2771.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 136.00 | 25690112 | 51380576.00 | 51373098.67 | 95.70 | 0.25 | 188.90 | true | 0.956969;0.957503;0.957368;0.957707;0.956345 | 25690112;25690112;25690112;25690112;25690112 | 51380576;51380576;51380576;51380576;51380608 | 51371968;51372928;51374400;51371040;51378880 | |
303 | resnetv23_stage3_conv68_fwd | Convolution | [256,256,14,14] | 3397299.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1505.33 | 19365101568 | 81819861.33 | 77054901.33 | 24.70 | 121.89 | 12864.33 | false | 0.246667;0.246791;0.246625;0.246724;0.247012 | 19365101568;19365101568;19365101568;19365101568;19365101568 | 78632064;83004736;83822784;76472416;86158432 | 77837184;77662240;76549984;75949952;76952480 | |
303 | resnetv23_stage3_conv68_fwd | Convolution | [256,256,14,14] | 3397299.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 277.00 | 251658240 | 51387381.33 | 150584437.33 | 47.10 | 1.25 | 908.51 | true | 0.470856;0.470208;0.471037;0.471331;0.470757 | 251658240;251658240;251658240;251658240;251658240 | 51368608;51411616;51403744;51387040;51371360 | 150578240;150590432;150576608;150584640;150610400 | |
303 | resnetv23_stage3_conv68_fwd | Convolution | [256,256,14,14] | 3397299.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 260.67 | 269484032 | 151350794.67 | 54234144.00 | 48.30 | 1.31 | 1033.82 | true | 0.482351;0.482652;0.483029;0.482682;0.482664 | 269484032;269484032;269484032;269484032;269484032 | 151369152;151197920;151357248;151353952;151341184 | 54269088;54197952;54246240;54258240;54085920 | |
303 | resnetv23_stage3_conv68_fwd | Convolution | [256,256,14,14] | 3397299.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 17.00 | 5308416 | 2359552.00 | 9353386.67 | 30.60 | 0.45 | 312.26 | true | 0.308135;0.306994;0.305811;0.304239;0.306180 | 5308416;5308416;5308416;5308416;5308416 | 2359552;2359552;2359552;2359552;2359552 | 9296768;9354368;9364736;9353600;9352192 | |
304 | resnetv23_stage3_batchnorm68_fwd | BatchNorm | [256,256,14,14] | 6324.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 143.00 | 81264640 | 51780000.00 | 55909354.67 | 88.20 | 0.75 | 568.28 | true | 0.882268;0.881610;0.882627;0.881476;0.882025 | 81264640;81264640;81264640;81264640;81264640 | 51776416;51778720;51790624;51784864;51769056 | 55953184;55907200;55905792;55915072;55898368 | |
305 | resnetv23_stage3_activation68 | Activation | [256,256,14,14] | 2761.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 137.00 | 25690112 | 51380576.00 | 51378506.67 | 95.70 | 0.25 | 187.52 | true | 0.957075;0.956809;0.958184;0.958333;0.957003 | 25690112;25690112;25690112;25690112;25690112 | 51388736;51385728;51373184;51372576;51376608 | 51380576;51388000;51380576;51380576;51380576 | |
306 | resnetv23_stage3_conv69_fwd | Convolution | [256,256,14,14] | 1416005.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2082.67 | 26409435136 | 156570389.33 | 152629557.33 | 24.90 | 85.41 | 12680.58 | false | 0.248912;0.249019;0.248929;0.248693;0.249004 | 26409435136;26409435136;26409435136;26409435136;26409435136 | 153654976;153747264;156060032;160916224;159903872 | 152243232;152284608;151470016;153815136;153360832 | |
306 | resnetv23_stage3_conv69_fwd | Convolution | [256,256,14,14] | 1416005.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 96.00 | 2517.33 | 5.80 | 0.00 | 0.00 | true | 0.057872;0.058094;0.058058;0.057892;0.058037 | 0;0;0;0;0 | 96;96;96;96;96 | 2432;2688;9088;2432;2432 | |
307 | resnetv23_stage3__plus22 | elemwise_add | [256,1024,14,14] | 13218.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 741.00 | 51380224 | 5353877.33 | 5930464.00 | 95.70 | 4.55 | 69.34 | true | 0.956756;0.957244;0.956934;0.956714;0.957100 | 51380224;51380224;51380224;51380224;51380224 | 4956864;5148000;6541984;5956768;128 | 5535296;5719968;7115936;6536128;736 | |
308 | resnetv23_stage4_batchnorm0_fwd | BatchNorm | [256,1024,14,14] | 25530.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 555.00 | 325058560 | 60452426.67 | 62812682.67 | 90.00 | 2.64 | 585.69 | true | 0.899795;0.900378;0.900067;0.900197;0.900069 | 325058560;325058560;325058560;325058560;325058560 | 51810944;64764224;61529440;58299616;61528224 | 55817472;69783936;62822336;62814720;62800992 | |
309 | resnetv23_stage4_activation0 | Activation | [256,1024,14,14] | 10764 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 540.67 | 102760448 | 104901642.67 | 104899242.67 | 98.70 | 0.49 | 190.06 | true | 0.986950;0.987126;0.987122;0.987471;0.986633 | 102760448;102760448;102760448;102760448;102760448 | 109176704;102761664;99545984;109178592;102759360 | 105972064;102760800;99549600;105972064;105972064 | |
310 | resnetv23_stage4_conv0_fwd | Convolution | [256,1024,14,14] | 2896540.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 4126.33 | 52664729600 | 155549077.33 | 49033365.33 | 18.00 | 257.43 | 12763.08 | false | 0.179329;0.178480;0.178442;0.182230;0.184573 | 52664729600;52664729600;52664729600;52664729600;52664729600 | 49083136;48956576;49002272;49057184;49040640 | 156007424;154951296;155162784;155477024;158003264 | |
310 | resnetv23_stage4_conv0_fwd | Convolution | [256,1024,14,14] | 2896540.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 4394.67 | 5.80 | 0.00 | 0.00 | true | 0.057908;0.058587;0.058087;0.057940;0.058092 | 0;0;0;0;0 | 96;6752;96;96;96 | 2432;16256;8320;2432;2432 | |
311 | resnetv23_stage4_batchnorm1_fwd | BatchNorm | [256,512,14,14] | 12405 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 281.67 | 162529280 | 103890720.00 | 111676512.00 | 89.20 | 0.75 | 577.03 | true | 0.892636;0.892429;0.892629;0.892110;0.892409 | 162529280;162529280;162529280;162529280;162529280 | 103875776;103895584;103880864;103895808;103895712 | 111739584;111663072;111783616;111626880;111609952 | |
312 | resnetv23_stage4_activation1 | Activation | [256,512,14,14] | 5426 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 269.00 | 51380224 | 102760800.00 | 102752448.00 | 97.50 | 0.25 | 191.00 | true | 0.975689;0.975192;0.975089;0.975471;0.975601 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102752768;102754048;102755456;102748256;102750528 | |
313 | resnetv23_stage4_conv1_fwd | Convolution | [256,512,14,14] | 4145336.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_small_nn_v1 | 5498.33 | 59202863104 | 21564149.33 | 9624949.33 | 15.40 | 1898.19 | 10767.42 | false | 0.153635;0.154448;0.152413;0.155192;0.153866 | 59202863104;59202863104;59202863104;59202863104;59202863104 | 21817376;21458912;21632096;21207296;21601440 | 9623648;9613792;9648896;9631872;9619328 | |
313 | resnetv23_stage4_conv1_fwd | Convolution | [256,512,14,14] | 4145336.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 3157.33 | 4.90 | 0.00 | 0.00 | true | 0.048651;0.049084;0.049120;0.048576;0.049157 | 0;0;0;0;0 | 96;5472;96;96;96 | 1280;13568;1536;1280;6656 | |
314 | resnetv23_stage4_batchnorm2_fwd | BatchNorm | [256,512,7,7] | 6421 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 219.00 | 44957696 | 25809962.67 | 26616704.00 | 18.30 | 0.86 | 205.29 | true | 0.184088;0.182476;0.182385;0.183265;0.185026 | 44957696;44957696;44957696;44957696;44957696 | 27048352;26176256;26569280;26849376;26431456 | 25796416;25813536;25814336;25833600;25802016 | |
315 | resnetv23_stage4_activation2 | Activation | [256,512,7,7] | 1359 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.00 | 12845056 | 25690464.00 | 25705514.67 | 94.40 | 0.25 | 186.16 | true | 0.946083;0.939898;0.944787;0.943927;0.944759 | 12845056;12845056;12845056;12845056;12845056 | 25690464;25690464;25690464;25690464;25690464 | 25705248;25707712;25706080;25685056;25705216 | |
316 | resnetv23_stage4_conv2_fwd | Convolution | [256,512,7,7] | 1723331.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2173.67 | 26358054912 | 102856981.33 | 79249290.67 | 24.90 | 144.74 | 12126.08 | false | 0.249213;0.249142;0.249169;0.249240;0.249348 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 100279712;100159328;104073056;104218176;106784384 | 78086688;79611488;78873952;80272832;79262432 | |
316 | resnetv23_stage4_conv2_fwd | Convolution | [256,512,7,7] | 1723331.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 1280.00 | 4.90 | 0.00 | 0.00 | true | 0.048667;0.049084;0.049091;0.048635;0.049277 | 0;0;0;0;0 | 1280;1280;1536;1280;1280 | 96;96;96;96;96 | |
317 | resnetv23_stage4_conv3_fwd | Convolution | [256,1024,14,14] | 3513134.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x128_relu_interior_nn_v1 | 4424.33 | 52664729600 | 276524544.00 | 48031040.00 | 16.20 | 162.27 | 11903.43 | false | 0.164333;0.161129;0.161521;0.161998;0.163123 | 52664729600;52664729600;52664729600;52664729600;52664729600 | 290955392;272682464;277352416;274454976;277766240 | 47658016;49251808;48241216;48193888;45294656 | |
317 | resnetv23_stage4_conv3_fwd | Convolution | [256,1024,14,14] | 3513134.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 192.00 | 1152.00 | 4.90 | 0.00 | 0.00 | true | 0.050332;0.049104;0.049153;0.048651;0.049166 | 0;0;0;0;0 | 960;1120;2592;1216;1120 | 224;96;3168;96;256 | |
318 | resnetv23_stage4__plus0 | elemwise_add | [256,2048,7,7] | 6509.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 372.33 | 25690112 | 205501472.00 | 102430005.33 | 93.70 | 0.08 | 69.00 | true | 0.937285;0.936982;0.937380;0.937374;0.937250 | 25690112;25690112;25690112;25690112;25690112 | 102055168;102625920;102608928;102667872;101913664 | 205492704;205563136;205503904;205495488;205505024 | |
319 | resnetv23_stage4_batchnorm3_fwd | BatchNorm | [256,2048,7,7] | 25386.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 862.33 | 179830784 | 7916554.67 | 17347274.67 | 21.70 | 7.12 | 208.54 | true | 0.216945;0.235489;0.225282;0.202351;0.207567 | 179830784;179830784;179830784;179830784;179830784 | 5958016;8523360;8781856;7266720;7959584 | 13000256;18676416;19172160;15919968;17445440 | |
320 | resnetv23_stage4_activation3 | Activation | [256,2048,7,7] | 5425.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.33 | 51380224 | 102760800.00 | 102727477.33 | 97.50 | 0.25 | 190.06 | true | 0.975215;0.974416;0.975588;0.975315;0.975051 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102733312;102721056;102745856;102726048;102723072 | |
321 | resnetv23_stage4_conv4_fwd | Convolution | [256,2048,7,7] | 1765703.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2481.00 | 26319519744 | 125589696.00 | 16771413.33 | 24.10 | 184.88 | 10608.43 | false | 0.240072;0.240428;0.241694;0.241049;0.242059 | 26319519744;26319519744;26319519744;26319519744;26319519744 | 125274944;125265600;125907360;125755776;125738368 | 16538176;16806752;16621408;16972608;16886080 | |
321 | resnetv23_stage4_conv4_fwd | Convolution | [256,2048,7,7] | 1765703.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 96.00 | 1365.33 | 4.90 | 0.00 | 0.00 | true | 0.048555;0.049130;0.049064;0.048748;0.049120 | 0;0;0;0;0 | 96;96;96;96;96 | 1280;1536;2304;1280;1280 | |
322 | resnetv23_stage4_batchnorm4_fwd | BatchNorm | [256,512,7,7] | 6410.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 219.00 | 44957696 | 25724864.00 | 25844032.00 | 18.50 | 0.87 | 205.29 | true | 0.188296;0.181087;0.183920;0.187287;0.183896 | 44957696;44957696;44957696;44957696;44957696 | 25735328;25752448;25791616;25679296;25686816 | 25748640;25905056;26189888;25832768;25794272 | |
323 | resnetv23_stage4_activation4 | Activation | [256,512,7,7] | 1358.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.33 | 12845056 | 25690464.00 | 25703978.67 | 94.50 | 0.25 | 185.27 | true | 0.944582;0.945128;0.943663;0.946747;0.945967 | 12845056;12845056;12845056;12845056;12845056 | 25690464;25690464;25690464;25690464;25690464 | 25697120;25709216;25699488;25707392;25705056 | |
324 | resnetv23_stage4_conv5_fwd | Convolution | [256,512,7,7] | 4136646 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1632.00 | 19346227200 | 58539904.00 | 42522698.67 | 24.70 | 191.43 | 11854.31 | false | 0.247012;0.246615;0.247155;0.246905;0.247381 | 19346227200;19346227200;19346227200;19346227200;19346227200 | 68125344;54852800;60771040;59433568;55415104 | 37444096;44123424;44302208;44306432;39142464 | |
324 | resnetv23_stage4_conv5_fwd | Convolution | [256,512,7,7] | 4136646 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 138.67 | 125829120 | 25388554.67 | 75259264.00 | 42.00 | 1.25 | 907.42 | true | 0.420317;0.420095;0.419242;0.420405;0.421870 | 125829120;125829120;125829120;125829120;125829120 | 25391712;25391584;25385184;25388256;25385824 | 75286240;75305664;75248864;75242688;75242688 | |
324 | resnetv23_stage4_conv5_fwd | Convolution | [256,512,7,7] | 4136646 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 135.67 | 433586176 | 75709045.33 | 28655776.00 | 47.60 | 4.15 | 3195.96 | true | 0.475617;0.476283;0.476419;0.475946;0.476123 | 433586176;433586176;433586176;433586176;433586176 | 75702784;75715040;75714464;75709888;75669568 | 28502304;28725472;28680576;28673920;28612832 | |
324 | resnetv23_stage4_conv5_fwd | Convolution | [256,512,7,7] | 4136646 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 64.67 | 21233664 | 9438976.00 | 37441546.67 | 77.20 | 0.45 | 328.35 | true | 0.760707;0.775677;0.771361;0.773078;0.770989 | 21233664;21233664;21233664;21233664;21233664 | 9438976;9438976;9439232;9438720;9438976 | 37439744;37384960;37441952;37450144;37442944 | |
325 | resnetv23_stage4_batchnorm5_fwd | BatchNorm | [256,512,7,7] | 6385 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 219.00 | 44957696 | 25414410.67 | 27593440.00 | 19.90 | 0.85 | 205.29 | true | 0.204089;0.203877;0.190231;0.198083;0.196443 | 44957696;44957696;44957696;44957696;44957696 | 25408512;25413952;25415968;25424576;25413312 | 27617792;27562880;27587968;27574560;27667360 | |
326 | resnetv23_stage4_activation5 | Activation | [256,512,7,7] | 1371 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.67 | 12845056 | 25690464.00 | 25682944.00 | 94.60 | 0.25 | 184.38 | true | 0.947172;0.945962;0.945574;0.945352;0.945520 | 12845056;12845056;12845056;12845056;12845056 | 25690464;25693024;25690464;25690464;25690464 | 25680928;25699872;25685408;25682496;25679648 | |
327 | resnetv23_stage4_conv6_fwd | Convolution | [256,512,7,7] | 1715791 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2175.67 | 26358054912 | 103649376.00 | 79390474.67 | 24.90 | 144.00 | 12114.93 | false | 0.249320;0.249094;0.249245;0.249332;0.249207 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 103516768;107066176;104055424;103375936;101246528 | 79379200;79282016;79510208;80742784;78774048 | |
327 | resnetv23_stage4_conv6_fwd | Convolution | [256,512,7,7] | 1715791 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.33 | 0 | 96.00 | 1450.67 | 4.90 | 0.00 | 0.00 | true | 0.048587;0.049124;0.050839;0.048675;0.049051 | 0;0;0;0;0 | 1536;1024;6656;1536;1280 | 96;96;96;96;96 | |
328 | resnetv23_stage4__plus1 | elemwise_add | [256,2048,7,7] | 6631.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 373.00 | 25690112 | 205528885.33 | 102501461.33 | 93.70 | 0.08 | 68.87 | true | 0.936808;0.937004;0.937386;0.937461;0.937277 | 25690112;25690112;25690112;25690112;25690112 | 205478688;205572448;205516864;205545472;205524320 | 102399328;102429440;102528096;102546848;102585216 | |
329 | resnetv23_stage4_batchnorm6_fwd | BatchNorm | [256,2048,7,7] | 25208 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 862.00 | 179830784 | 8032949.33 | 17563104.00 | 21.00 | 7.03 | 208.62 | true | 0.217589;0.206739;0.202364;0.216436;0.206783 | 179830784;179830784;179830784;179830784;179830784 | 7832288;8138784;8280992;8102848;7857216 | 17144192;17780416;18137728;17704224;17204672 | |
330 | resnetv23_stage4_activation6 | Activation | [256,2048,7,7] | 5519 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 268.67 | 51380224 | 102760800.00 | 102705386.67 | 97.60 | 0.25 | 191.24 | true | 0.976065;0.976624;0.974735;0.973864;0.975870 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102701600;102738304;102699488;102706144;102708416 | |
331 | resnetv23_stage4_conv7_fwd | Convolution | [256,2048,7,7] | 1762880.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2480.67 | 26319519744 | 128255594.67 | 16504693.33 | 24.10 | 181.81 | 10609.86 | false | 0.240409;0.242004;0.241137;0.241511;0.241063 | 26319519744;26319519744;26319519744;26319519744;26319519744 | 126788320;131078560;134290752;123398336;126899904 | 16182240;16152064;16841792;16780352;16551488 | |
331 | resnetv23_stage4_conv7_fwd | Convolution | [256,2048,7,7] | 1762880.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 1280.00 | 4.90 | 0.00 | 0.00 | true | 0.048664;0.049087;0.049166;0.050425;0.049157 | 0;0;0;0;0 | 96;96;96;96;352 | 1280;1280;1280;1280;2048 | |
332 | resnetv23_stage4_batchnorm7_fwd | BatchNorm | [256,512,7,7] | 6416.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 219.00 | 44957696 | 25699349.33 | 26146485.33 | 19.10 | 0.87 | 205.29 | true | 0.193120;0.194337;0.181145;0.185121;0.200893 | 44957696;44957696;44957696;44957696;44957696 | 26557536;26888832;25945504;25845920;25936416 | 25634624;25739488;25723200;25749440;25635360 | |
333 | resnetv23_stage4_activation7 | Activation | [256,512,7,7] | 1364 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.67 | 12845056 | 25690464.00 | 25703765.33 | 94.50 | 0.25 | 184.38 | true | 0.946562;0.943989;0.946798;0.944856;0.942948 | 12845056;12845056;12845056;12845056;12845056 | 25700576;25703776;25706944;25697888;25710432 | 25690464;25690464;25690464;25690464;25690464 | |
334 | resnetv23_stage4_conv8_fwd | Convolution | [256,512,7,7] | 4137058.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 1645.33 | 19346227200 | 56858826.67 | 40881674.67 | 24.70 | 197.93 | 11758.24 | false | 0.246616;0.247238;0.247794;0.247101;0.246785 | 19346227200;19346227200;19346227200;19346227200;19346227200 | 41787616;37471488;36910752;43385920;43951840 | 57762848;61642752;56546432;54723552;56267200 | |
334 | resnetv23_stage4_conv8_fwd | Convolution | [256,512,7,7] | 4137058.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 137.00 | 125829120 | 25374133.33 | 75300992.00 | 41.90 | 1.25 | 918.46 | true | 0.419087;0.417400;0.418799;0.419944;0.420065 | 125829120;125829120;125829120;125829120;125829120 | 25371104;25373152;25374112;25378272;25375136 | 75317952;75311072;75300032;75291872;75218240 | |
334 | resnetv23_stage4_conv8_fwd | Convolution | [256,512,7,7] | 4137058.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 134.67 | 433586176 | 75680149.33 | 28764896.00 | 47.60 | 4.15 | 3219.69 | true | 0.476491;0.475848;0.475496;0.475519;0.475385 | 433586176;433586176;433586176;433586176;433586176 | 75682400;75663488;75675328;75700320;75682720 | 28801888;28564864;28953600;28882304;28610496 | |
334 | resnetv23_stage4_conv8_fwd | Convolution | [256,512,7,7] | 4137058.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 67.67 | 21233664 | 9438037.33 | 37402944.00 | 77.10 | 0.45 | 313.80 | true | 0.774316;0.765613;0.772278;0.757032;0.774269 | 21233664;21233664;21233664;21233664;21233664 | 9437440;9438208;9437952;9438208;9437952 | 37402656;37346080;37408256;37397920;37431936 | |
335 | resnetv23_stage4_batchnorm8_fwd | BatchNorm | [256,512,7,7] | 6312.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 219.00 | 44957696 | 25417066.67 | 27589984.00 | 19.90 | 0.85 | 205.29 | true | 0.197036;0.192829;0.211590;0.192269;0.208052 | 44957696;44957696;44957696;44957696;44957696 | 25407232;25416128;25418592;25416480;25421856 | 27604640;27588960;27684800;27541760;27576352 | |
336 | resnetv23_stage4_activation8 | Activation | [256,512,7,7] | 1342.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 69.33 | 12845056 | 25690464.00 | 25681002.67 | 94.50 | 0.25 | 185.27 | true | 0.945857;0.944738;0.944939;0.946897;0.944391 | 12845056;12845056;12845056;12845056;12845056 | 25690464;25690464;25690464;25690464;25690464 | 25687136;25673792;25667040;25683040;25686176 | |
337 | resnetv23_stage4_conv9_fwd | Convolution | [256,512,7,7] | 1723032 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 2165.67 | 26358054912 | 102885429.33 | 79526005.33 | 24.90 | 144.50 | 12170.87 | false | 0.249264;0.249326;0.249299;0.249125;0.248975 | 26358054912;26358054912;26358054912;26358054912;26358054912 | 101884672;100762912;113716384;105146816;101624800 | 81873632;79640288;79184608;79737344;79200384 | |
337 | resnetv23_stage4_conv9_fwd | Convolution | [256,512,7,7] | 1723032 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.67 | 0 | 96.00 | 1280.00 | 4.90 | 0.00 | 0.00 | true | 0.048664;0.049041;0.049180;0.048592;0.049189 | 0;0;0;0;0 | 96;96;96;96;96 | 1280;1280;1536;1280;1280 | |
338 | resnetv23_stage4__plus2 | elemwise_add | [256,2048,7,7] | 6557.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::plus, 1>, float*, float*, float*>(int, float*, float*, float*) | 372.67 | 25690112 | 205529589.33 | 102417632.00 | 93.70 | 0.08 | 68.94 | true | 0.937360;0.937250;0.937659;0.937138;0.936764 | 25690112;25690112;25690112;25690112;25690112 | 205525120;205545728;205517472;205517920;205570080 | 102377952;102455040;102419904;102310208;102547264 | |
339 | resnetv23_batchnorm2_fwd | BatchNorm | [256,2048,7,7] | 25233.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 862.33 | 179830784 | 8261429.33 | 18063381.33 | 21.70 | 6.83 | 208.54 | true | 0.222178;0.220633;0.205822;0.217141;0.213596 | 179830784;179830784;179830784;179830784;179830784 | 18643808;19254656;18022016;12505600;17524320 | 8534656;8794624;8240000;5709568;8009632 | |
340 | resnetv23_relu1_fwd | Activation | [256,2048,7,7] | 5436.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 270.00 | 51380224 | 102760800.00 | 102726688.00 | 97.50 | 0.25 | 190.30 | true | 0.975212;0.975649;0.974730;0.974850;0.975401 | 51380224;51380224;51380224;51380224;51380224 | 102760800;102760800;102760800;102760800;102760800 | 102730144;102725536;102728704;102725824;102722912 | |
341 | resnetv23_pool1_fwd | Pooling | [256,2048,7,7] | 33830.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 238.33 | 37017088 | 176778400.00 | 5639744.00 | 60.50 | 0.20 | 155.32 | true | 0.605085;0.604603;0.605283;0.604858;0.605051 | 37017088;37017088;37017088;37017088;37017088 | 5631904;5673440;5649312;5638016;5625120 | 176189280;176723296;177030592;176581312;177517408 | |
343 | resnetv23_dense0_fwd | FullyConnected | [256,2048] | 58053.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x32_sliced1x4_tn | 138.67 | 1077936128 | 10723221.33 | 208629.33 | 12.50 | 98.61 | 7773.56 | false | 0.124608;0.124602;0.124602;0.124603;0.124602 | 1077936128;1077936128;1077936128;1077936128;1077936128 | 10725824;10728352;10717952;10713536;10725888 | 199776;209568;208096;217824;208224 | |
343 | resnetv23_dense0_fwd | FullyConnected | [256,2048] | 58053.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 6.00 | 256000 | 5760.00 | 0.00 | 59.90 | 44.44 | 42.67 | false | 0.608066;0.596282;0.592253;0.603527;0.597546 | 256000;256000;256000;256000;256000 | 5760;5760;5760;5760;5760 | 0;0;1152;0;0 |
Showing 1 to 503 of 503 entries