GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | densenet1_conv0_fwd | Convolution | [1,3,224,224] | 22911 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 56.67 | 367288320 | 267136.00 | 4271989.33 | 30.10 | 80.92 | 6481.52 | false | 0.298736;0.301996;0.297346;0.302020;0.303917 | 367288320;367288320;367288320;367288320;367288320 | 265984;331136;250880;284544;234240 | 4263584;4324512;4278208;4274176;4252256 | |
1 | densenet1_batchnorm0_fwd | BatchNorm | [1,96,112,112] | 371 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 20.00 | 7274496 | 1439477.33 | 6826741.33 | 29.00 | 0.88 | 363.72 | true | 0.289397;0.291054;0.290501;0.289984;0.290431 | 7274496;7274496;7274496;7274496;7274496 | 1420448;1473184;1463200;1431840;1423392 | 6855040;6800896;6814880;6813664;6851680 | |
2 | densenet1_relu0_fwd | Activation | [1,96,112,112] | 331.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 12.00 | 2408448 | 2140960.00 | 3303253.33 | 86.30 | 0.44 | 200.70 | true | 0.863767;0.863029;0.863196;0.855862;0.863617 | 2408448;2408448;2408448;2408448;2408448 | 2135840;2161056;2142624;2131360;2144416 | 3296064;3334688;3305728;3303744;3300288 | |
3 | densenet1_pool0_fwd | Pooling | [1,96,112,112] | 4230 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 11.33 | 301056 | 5312.00 | 1599797.33 | 60.80 | 0.19 | 26.56 | true | 0.605440;0.613270;0.609804;0.602845;0.608223 | 301056;301056;301056;301056;301056 | 6144;4416;5376;4416;8256 | 1600864;1598816;1599712;1601664;1598720 | |
4 | densenet1_stage1_batchnorm0_fwd | BatchNorm | [1,96,56,56] | 239 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 1855488 | 1536.00 | 919498.67 | 27.40 | 2.01 | 327.42 | true | 0.273746;0.273451;0.276170;0.274108;0.273603 | 1855488;1855488;1855488;1855488;1855488 | 1792;1536;1536;1536;1536 | 919680;922496;917184;918400;920416 | |
5 | densenet1_stage1_relu0_fwd | Activation | [1,96,56,56] | 87 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 602112 | 224.00 | 280650.67 | 61.00 | 2.14 | 120.42 | true | 0.614868;0.605757;0.606715;0.612281;0.610045 | 602112;602112;602112;602112;602112 | 224;224;224;224;224 | 274176;280256;287264;282208;279488 | |
6 | densenet1_stage1_conv0_fwd | Convolution | [1,96,56,56] | 6736 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 116207616 | 180842.67 | 2239029.33 | 17.10 | 48.02 | 4648.30 | false | 0.169570;0.172410;0.171586;0.171420;0.171094 | 116207616;116207616;116207616;116207616;116207616 | 164160;202176;180416;195904;166208 | 2216448;2254496;2240992;2246496;2229600 | |
7 | densenet1_stage1_batchnorm1_fwd | BatchNorm | [1,192,56,56] | 170.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 3710976 | 3093.33 | 2010538.67 | 52.60 | 1.84 | 618.50 | true | 0.525489;0.526198;0.525242;0.523647;0.526457 | 3710976;3710976;3710976;3710976;3710976 | 3072;3136;4096;3072;3072 | 2000544;2040320;2004992;2026080;1990464 | |
8 | densenet1_stage1_relu1_fwd | Activation | [1,192,56,56] | 175 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1204224 | 192.00 | 741162.67 | 72.90 | 1.62 | 200.70 | true | 0.728258;0.728546;0.726943;0.730440;0.731212 | 1204224;1204224;1204224;1204224;1204224 | 224;224;128;128;224 | 745248;733120;758112;739520;738720 | |
9 | densenet1_stage1_conv1_fwd | Convolution | [1,192,56,56] | 38351.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 52.33 | 368492544 | 10325.33 | 1016010.67 | 12.50 | 359.04 | 7041.30 | false | 0.124918;0.124913;0.124916;0.124917;0.124914 | 368492544;368492544;368492544;368492544;368492544 | 6656;10240;10496;10240;10496 | 1009216;1016896;1015616;1016000;1016416 | |
9 | densenet1_stage1_conv1_fwd | Convolution | [1,192,56,56] | 38351.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 333141.33 | 455061.33 | 7.50 | 0.90 | 142.54 | true | 0.075462;0.075133;0.075256;0.075161;0.075261 | 712704;712704;712704;712704;712704 | 337664;333056;333056;333312;333056 | 457248;455200;457120;450304;452864 | |
10 | densenet1_stage1_concat0 | Concat | [1,96,56,56] | 125 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.14 | 0 | 482432.00 | 747712.00 | 57.40 | 0.00 | 0.00 | true | 0.654057;0.482857;0.656157;0.484927;0.650028;0.511763;0.653990;0.487636;0.656457;0.482771 | 0;0;0;0;0;0;0;0;0;0 | 963072;0;972800;0;968320;0;968320;0;963200;0 | 988000;508288;1016928;500352;994816;504800;993248;504384;987552;468352 | |
10 | densenet1_stage1_concat0 | Concat | [1,96,56,56] | 125 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 482432.00 | 747712.00 | 57.40 | 0.00 | 0.00 | true | 0.654057;0.482857;0.656157;0.484927;0.650028;0.511763;0.653990;0.487636;0.656457;0.482771 | 0;0;0;0;0;0;0;0;0;0 | 963072;0;972800;0;968320;0;968320;0;963200;0 | 988000;508288;1016928;500352;994816;504800;993248;504384;987552;468352 | |
11 | densenet1_stage1_batchnorm2_fwd | BatchNorm | [1,144,56,56] | 341.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 2783232 | 2400.00 | 1206613.33 | 40.20 | 2.30 | 463.87 | true | 0.402825;0.402704;0.401688;0.401985;0.401461 | 2783232;2783232;2783232;2783232;2783232 | 1198720;1206560;1234176;1200640;1212640 | 2400;2400;2400;2400;12896 | |
12 | densenet1_stage1_relu2_fwd | Activation | [1,144,56,56] | 126.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.33 | 903168 | 85.33 | 376768.00 | 69.20 | 2.40 | 169.35 | true | 0.691053;0.690983;0.692297;0.692822;0.692318 | 903168;903168;903168;903168;903168 | 393568;369952;365152;390400;369952 | 128;128;0;0;128 | |
13 | densenet1_stage1_conv2_fwd | Convolution | [1,144,56,56] | 9545 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 32.00 | 174010368 | 124885.33 | 2129898.67 | 17.10 | 77.17 | 5437.82 | false | 0.170444;0.170601;0.172035;0.170876;0.171035 | 174010368;174010368;174010368;174010368;174010368 | 127680;123904;128640;122240;123072 | 2125216;2139168;2135296;2129184;2107104 | |
14 | densenet1_stage1_batchnorm3_fwd | BatchNorm | [1,192,56,56] | 179.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 3710976 | 3072.00 | 1750720.00 | 52.50 | 2.12 | 585.97 | true | 0.524523;0.525433;0.526033;0.524763;0.525631 | 3710976;3710976;3710976;3710976;3710976 | 3072;3072;3072;3072;3072 | 1723904;1779808;1727392;1785536;1744960 | |
15 | densenet1_stage1_relu3_fwd | Activation | [1,192,56,56] | 174 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1204224 | 224.00 | 716554.67 | 72.90 | 1.68 | 200.70 | true | 0.730187;0.729015;0.730581;0.728663;0.723316 | 1204224;1204224;1204224;1204224;1204224 | 726624;712480;721120;716064;710208 | 224;224;224;224;224 | |
16 | densenet1_stage1_conv3_fwd | Convolution | [1,192,56,56] | 37701.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.33 | 368492544 | 298.67 | 995210.67 | 12.50 | 370.15 | 7321.09 | false | 0.124917;0.124918;0.124918;0.124918;0.124913 | 368492544;368492544;368492544;368492544;368492544 | 256;320;320;5696;256 | 995936;995360;994944;980896;995328 | |
16 | densenet1_stage1_conv3_fwd | Convolution | [1,192,56,56] | 37701.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 463146.67 | 7.50 | 0.90 | 142.54 | true | 0.075214;0.075278;0.075479;0.075301;0.075415 | 712704;712704;712704;712704;712704 | 458464;457504;463808;479328;467168 | 331776;331776;331776;338432;331776 | |
17 | densenet1_stage1_concat1 | Concat | [1,144,56,56] | 137.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.00 | 0 | 874293.33 | 922410.67 | 60.20 | 0.00 | 0.00 | true | 0.717810;0.510639;0.712043;0.483788;0.711606;0.482788;0.712230;0.481792;0.715211;0.484473 | 0;0;0;0;0;0;0;0;0;0 | 1430144;416032;1483936;402016;1438816;416640;1461536;418592;1413376;416896 | 1750656;64;1751424;160;1742976;0;1750144;1792;1750528;160 | |
17 | densenet1_stage1_concat1 | Concat | [1,144,56,56] | 137.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 874293.33 | 922410.67 | 60.20 | 0.00 | 0.00 | true | 0.717810;0.510639;0.712043;0.483788;0.711606;0.482788;0.712230;0.481792;0.715211;0.484473 | 0;0;0;0;0;0;0;0;0;0 | 1750656;64;1751424;160;1742976;0;1750144;1792;1750528;160 | 1430144;416032;1483936;402016;1438816;416640;1461536;418592;1413376;416896 | |
18 | densenet1_stage1_batchnorm4_fwd | BatchNorm | [1,192,56,56] | 416.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 3710976 | 10570.67 | 2159168.00 | 52.80 | 1.71 | 585.97 | true | 0.527045;0.527172;0.529436;0.527753;0.528396 | 3710976;3710976;3710976;3710976;3710976 | 2154560;2152448;2181792;2168384;2154560 | 10080;11552;10080;9952;12000 | |
19 | densenet1_stage1_relu4_fwd | Activation | [1,192,56,56] | 126.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1204224 | 128.00 | 557664.00 | 72.70 | 2.16 | 200.70 | true | 0.725535;0.730171;0.726578;0.725867;0.728976 | 1204224;1204224;1204224;1204224;1204224 | 128;128;128;128;128 | 566592;562624;534144;547296;563072 | |
20 | densenet1_stage1_conv4_fwd | Convolution | [1,192,56,56] | 12304 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.33 | 231813120 | 173824.00 | 2235466.67 | 17.20 | 96.22 | 5893.60 | false | 0.170838;0.172105;0.170631;0.171991;0.173840 | 231813120;231813120;231813120;231813120;231813120 | 184704;165568;170624;176000;174848 | 2237152;2223584;2214976;2247232;2245664 | |
21 | densenet1_stage1_batchnorm5_fwd | BatchNorm | [1,192,56,56] | 160.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 3710976 | 3072.00 | 1746432.00 | 52.60 | 2.12 | 618.50 | true | 0.524742;0.526408;0.526718;0.526073;0.526776 | 3710976;3710976;3710976;3710976;3710976 | 1721376;1715552;1728928;1788992;1792256 | 4096;3072;3072;3072;3072 | |
22 | densenet1_stage1_relu5_fwd | Activation | [1,192,56,56] | 124.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1204224 | 224.00 | 625770.67 | 72.80 | 1.92 | 200.70 | true | 0.727083;0.728658;0.727976;0.742787;0.726545 | 1204224;1204224;1204224;1204224;1204224 | 224;224;224;224;224 | 619104;618368;624032;634176;635040 | |
23 | densenet1_stage1_conv5_fwd | Convolution | [1,192,56,56] | 37317.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.67 | 368492544 | 0.00 | 1011861.33 | 12.50 | 364.17 | 7272.83 | false | 0.124916;0.124915;0.124917;0.124917;0.124916 | 368492544;368492544;368492544;368492544;368492544 | 0;0;0;0;0 | 1013856;1007424;1007392;1016192;1014304 | |
23 | densenet1_stage1_conv5_fwd | Convolution | [1,192,56,56] | 37317.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 523061.33 | 7.50 | 0.83 | 142.54 | true | 0.075226;0.075227;0.075191;0.075309;0.075297 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 524544;538016;523584;521056;518496 | |
24 | densenet1_stage1_concat2 | Concat | [1,192,56,56] | 148.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.86 | 0 | 1195920.00 | 1211818.67 | 63.30 | 0.00 | 0.00 | true | 0.758660;0.507893;0.755068;0.508325;0.758499;0.513019;0.753409;0.509681;0.756062;0.507589 | 0;0;0;0;0;0;0;0;0;0 | 2364032;28576;2367360;28448;2358528;28960;2367872;23840;2362880;32544 | 2012320;409312;2015552;405216;2013408;413344;2001152;417216;2020992;413472 | |
24 | densenet1_stage1_concat2 | Concat | [1,192,56,56] | 148.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.14 | 0 | 1195920.00 | 1211818.67 | 63.30 | 0.00 | 0.00 | true | 0.758660;0.507893;0.755068;0.508325;0.758499;0.513019;0.753409;0.509681;0.756062;0.507589 | 0;0;0;0;0;0;0;0;0;0 | 2364032;28576;2367360;28448;2358528;28960;2367872;23840;2362880;32544 | 2012320;409312;2015552;405216;2013408;413344;2001152;417216;2020992;413472 | |
25 | densenet1_stage1_batchnorm6_fwd | BatchNorm | [1,240,56,56] | 515.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 9.00 | 4638720 | 388448.00 | 2802538.67 | 63.30 | 1.45 | 515.41 | true | 0.632599;0.626533;0.637332;0.631719;0.633784 | 4638720;4638720;4638720;4638720;4638720 | 386464;388832;390048;390560;385632 | 2802304;2800896;2804416;2827296;2796800 | |
26 | densenet1_stage1_relu6_fwd | Activation | [1,240,56,56] | 160.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.33 | 1505280 | 1184.00 | 1034506.67 | 75.20 | 1.45 | 237.69 | true | 0.753647;0.747954;0.755547;0.748192;0.759365 | 1505280;1505280;1505280;1505280;1505280 | 1632;544;6112;1376;544 | 1033152;1037056;1033312;1017216;1050240 | |
27 | densenet1_stage1_conv6_fwd | Convolution | [1,240,56,56] | 15487.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 48.00 | 289615872 | 227797.33 | 2617610.67 | 17.30 | 101.78 | 6033.66 | false | 0.171576;0.175433;0.169443;0.175300;0.171896 | 289615872;289615872;289615872;289615872;289615872 | 233088;225856;234048;224448;218944 | 2635744;2594368;2622720;2676128;2585984 | |
28 | densenet1_stage1_batchnorm7_fwd | BatchNorm | [1,192,56,56] | 169.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 3710976 | 3072.00 | 2146133.33 | 52.60 | 1.73 | 530.14 | true | 0.525097;0.524991;0.528071;0.525993;0.526565 | 3710976;3710976;3710976;3710976;3710976 | 3072;3072;3072;3072;3136 | 2161696;2173824;2138752;2137952;2131744 | |
29 | densenet1_stage1_relu7_fwd | Activation | [1,192,56,56] | 126 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1204224 | 224.00 | 588608.00 | 72.90 | 2.05 | 200.70 | true | 0.730463;0.729196;0.727217;0.730599;0.728170 | 1204224;1204224;1204224;1204224;1204224 | 224;224;224;2272;224 | 585408;582208;605920;589056;591360 | |
30 | densenet1_stage1_conv7_fwd | Convolution | [1,192,56,56] | 37400 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.67 | 368492544 | 0.00 | 1044618.67 | 12.50 | 352.75 | 7272.83 | false | 0.124914;0.124918;0.124913;0.124914;0.124917 | 368492544;368492544;368492544;368492544;368492544 | 0;0;0;0;0 | 1047232;1040800;1046112;1046944;1040768 | |
30 | densenet1_stage1_conv7_fwd | Convolution | [1,192,56,56] | 37400 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 509994.67 | 7.50 | 0.85 | 142.54 | true | 0.075275;0.075381;0.075300;0.075348;0.075183 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 507776;509312;512704;507968;513152 | |
31 | densenet1_stage1_concat3 | Concat | [1,240,56,56] | 189 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 8.71 | 0 | 1722789.33 | 1568709.33 | 65.10 | 0.00 | 0.00 | true | 0.768384;0.525620;0.771964;0.517847;0.772641;0.517490;0.772083;0.526392;0.778146;0.542614 | 0;0;0;0;0;0;0;0;0;0 | 2465824;668608;2460576;702880;2454880;673408;2442464;669472;2515840;678048 | 3010560;422688;3010560;440096;3010816;433696;3010560;430752;3010560;431264 | |
31 | densenet1_stage1_concat3 | Concat | [1,240,56,56] | 189 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.86 | 0 | 1722789.33 | 1568709.33 | 65.10 | 0.00 | 0.00 | true | 0.768384;0.525620;0.771964;0.517847;0.772641;0.517490;0.772083;0.526392;0.778146;0.542614 | 0;0;0;0;0;0;0;0;0;0 | 3010560;422688;3010560;440096;3010816;433696;3010560;430752;3010560;431264 | 2465824;668608;2460576;702880;2454880;673408;2442464;669472;2515840;678048 | |
32 | densenet1_stage1_batchnorm8_fwd | BatchNorm | [1,288,56,56] | 515 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 11.00 | 5566464 | 1189130.67 | 3441653.33 | 74.30 | 1.20 | 506.04 | true | 0.741787;0.746760;0.741335;0.733412;0.745902 | 5566464;5566464;5566464;5566464;5566464 | 3447360;3421696;3442048;3482336;3435552 | 1203296;1210336;1174752;1188960;1175136 | |
33 | densenet1_stage1_relu8_fwd | Activation | [1,288,56,56] | 183 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 8.00 | 1806336 | 277792.00 | 1832042.67 | 81.10 | 0.86 | 225.79 | true | 0.811820;0.814118;0.809830;0.812007;0.802524 | 1806336;1806336;1806336;1806336;1806336 | 280224;282400;262944;275872;277280 | 1855040;1855712;1825952;1815136;1814048 | |
34 | densenet1_stage1_conv8_fwd | Convolution | [1,288,56,56] | 18458.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 55.00 | 347418624 | 252714.67 | 2800800.00 | 17.40 | 113.78 | 6316.70 | false | 0.171620;0.177190;0.171827;0.177544;0.172230 | 347418624;347418624;347418624;347418624;347418624 | 254336;262272;253696;250112;245120 | 2807392;2768000;2853088;2798400;2796608 | |
35 | densenet1_stage1_batchnorm9_fwd | BatchNorm | [1,192,56,56] | 157.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.67 | 3710976 | 6954.67 | 2374005.33 | 52.10 | 1.56 | 484.02 | true | 0.529447;0.519852;0.522470;0.516137;0.520722 | 3710976;3710976;3710976;3710976;3710976 | 6528;8448;8704;5760;5888 | 2403104;2364768;2385216;2372032;2339936 | |
36 | densenet1_stage1_relu9_fwd | Activation | [1,192,56,56] | 125.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1204224 | 224.00 | 587925.33 | 72.90 | 2.05 | 200.70 | true | 0.730498;0.730363;0.733892;0.724033;0.726418 | 1204224;1204224;1204224;1204224;1204224 | 224;224;224;224;224 | 580544;580704;589280;593792;596256 | |
37 | densenet1_stage1_conv9_fwd | Convolution | [1,192,56,56] | 37542.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 51.00 | 368492544 | 0.00 | 1019477.33 | 12.50 | 361.45 | 7225.34 | false | 0.124917;0.124915;0.124914;0.124916;0.124915 | 368492544;368492544;368492544;368492544;368492544 | 1024160;1021056;1019040;1008352;1018336 | 0;0;0;0;0 | |
37 | densenet1_stage1_conv9_fwd | Convolution | [1,192,56,56] | 37542.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 478624.00 | 7.50 | 0.88 | 142.54 | true | 0.075178;0.075372;0.075184;0.075180;0.074835 | 712704;712704;712704;712704;712704 | 475648;476896;470464;492288;483328 | 331776;331776;331776;331776;331776 | |
38 | densenet1_stage1_concat4 | Concat | [1,288,56,56] | 217.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.00 | 0 | 2098661.33 | 1796437.33 | 65.20 | 0.00 | 0.00 | true | 0.787685;0.507801;0.787736;0.509037;0.790293;0.507304;0.788668;0.518374;0.789533;0.520313 | 0;0;0;0;0;0;0;0;0;0 | 3612672;582816;3612672;584480;3612736;585376;3612672;584096;3612672;580640 | 2970784;595424;3012576;608256;2978080;602560;2990080;615136;3001792;616288 | |
38 | densenet1_stage1_concat4 | Concat | [1,288,56,56] | 217.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.00 | 0 | 2098661.33 | 1796437.33 | 65.20 | 0.00 | 0.00 | true | 0.787685;0.507801;0.787736;0.509037;0.790293;0.507304;0.788668;0.518374;0.789533;0.520313 | 0;0;0;0;0;0;0;0;0;0 | 3612672;582816;3612672;584480;3612736;585376;3612672;584096;3612672;580640 | 2970784;595424;3012576;608256;2978080;602560;2990080;615136;3001792;616288 | |
39 | densenet1_stage1_batchnorm10_fwd | BatchNorm | [1,336,56,56] | 545 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 15.67 | 6494208 | 2320992.00 | 4182549.33 | 81.90 | 1.00 | 414.52 | true | 0.813052;0.834818;0.821817;0.815974;0.819585 | 6494208;6494208;6494208;6494208;6494208 | 2330720;2318048;2290400;2315872;2329056 | 4214624;4178144;4184288;4184416;4178944 | |
40 | densenet1_stage1_relu10_fwd | Activation | [1,336,56,56] | 226.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 9.67 | 2107392 | 886474.67 | 2742762.67 | 83.50 | 0.58 | 218.00 | true | 0.837517;0.833191;0.826741;0.844540;0.835681 | 2107392;2107392;2107392;2107392;2107392 | 889760;898464;883232;885920;883744 | 2752192;2768128;2711232;2747712;2728384 | |
41 | densenet1_stage1_conv10_fwd | Convolution | [1,336,56,56] | 21593.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 63.00 | 405221376 | 278784.00 | 2931989.33 | 17.60 | 126.21 | 6432.09 | false | 0.175018;0.176899;0.177262;0.176354;0.174164 | 405221376;405221376;405221376;405221376;405221376 | 278784;279808;293376;275008;277760 | 2913952;2933056;2953952;2907200;2948960 | |
42 | densenet1_stage1_batchnorm11_fwd | BatchNorm | [1,192,56,56] | 167 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 3710976 | 11253.33 | 2536458.67 | 52.10 | 1.46 | 463.87 | true | 0.517355;0.521199;0.522896;0.519394;0.521988 | 3710976;3710976;3710976;3710976;3710976 | 2526912;2573280;2518656;2539136;2543328 | 9120;12448;16544;10656;10656 | |
43 | densenet1_stage1_relu11_fwd | Activation | [1,192,56,56] | 130.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1204224 | 288.00 | 535616.00 | 73.30 | 2.25 | 200.70 | true | 0.733067;0.730661;0.733120;0.733440;0.733699 | 1204224;1204224;1204224;1204224;1204224 | 546272;537792;522784;555648;521504 | 288;288;288;5408;288 | |
44 | densenet1_stage1_conv11_fwd | Convolution | [1,192,56,56] | 38079.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.33 | 368492544 | 0.00 | 1005749.33 | 12.50 | 366.39 | 7321.09 | false | 0.124916;0.124913;0.124916;0.124915;0.124915 | 368492544;368492544;368492544;368492544;368492544 | 0;0;0;0;0 | 1004160;1007616;1006272;1006816;999200 | |
44 | densenet1_stage1_conv11_fwd | Convolution | [1,192,56,56] | 38079.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 472213.33 | 7.50 | 0.89 | 142.54 | true | 0.075342;0.075469;0.075207;0.075397;0.075413 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 467264;474816;473024;469472;474144 | |
45 | densenet1_stage1_concat5 | Concat | [1,336,56,56] | 287 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 10.43 | 0 | 2408517.33 | 2103114.67 | 66.40 | 0.00 | 0.00 | true | 0.796733;0.525099;0.799027;0.535321;0.798466;0.528983;0.796886;0.513997;0.797983;0.528356 | 0;0;0;0;0;0;0;0;0;0 | 4214848;602144;4214848;602144;4214848;602144;4216896;602272;4214848;602144 | 3580800;638144;3543936;646240;3586528;637856;3576064;636192;3570048;644256 | |
45 | densenet1_stage1_concat5 | Concat | [1,336,56,56] | 287 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 9.00 | 0 | 2408517.33 | 2103114.67 | 66.40 | 0.00 | 0.00 | true | 0.796733;0.525099;0.799027;0.535321;0.798466;0.528983;0.796886;0.513997;0.797983;0.528356 | 0;0;0;0;0;0;0;0;0;0 | 4214848;602144;4214848;602144;4214848;602144;4216896;602272;4214848;602144 | 3580800;638144;3543936;646240;3586528;637856;3576064;636192;3570048;644256 | |
46 | densenet1_batchnorm1_fwd | BatchNorm | [1,384,56,56] | 655.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 17.00 | 7421952 | 3550517.33 | 4842090.67 | 80.10 | 0.88 | 436.59 | true | 0.795864;0.801453;0.777502;0.805451;0.822253 | 7421952;7421952;7421952;7421952;7421952 | 3504480;3557600;3570272;3532768;3561184 | 4842592;4841824;4842944;4840608;4841856 | |
47 | densenet1_relu1_fwd | Activation | [1,384,56,56] | 241.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11.00 | 2408448 | 1711520.00 | 3859242.67 | 88.60 | 0.43 | 218.95 | true | 0.887002;0.885222;0.882537;0.885052;0.887820 | 2408448;2408448;2408448;2408448;2408448 | 1711008;1705120;1719456;1696160;1718432 | 3845760;3861312;3870336;3846080;3874080 | |
48 | densenet1_conv1_fwd | Convolution | [1,384,56,56] | 24785.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 70.67 | 463024128 | 327936.00 | 3099722.67 | 17.60 | 135.08 | 6552.20 | false | 0.175271;0.175094;0.176792;0.171872;0.182843 | 463024128;463024128;463024128;463024128;463024128 | 3056032;3152256;3202080;3079264;3067648 | 329472;336128;328704;314368;325632 | |
49 | densenet1_pool1_fwd | Pooling | [1,192,56,56] | 1622.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.67 | 3161088 | 6058.67 | 858154.67 | 40.20 | 3.66 | 412.30 | true | 0.402814;0.403485;0.399062;0.404480;0.398705 | 3161088;3161088;3161088;3161088;3161088 | 867296;845408;852576;866912;854976 | 3072;8192;8320;6400;3584 | |
50 | densenet1_stage2_batchnorm0_fwd | BatchNorm | [1,192,28,28] | 87.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3232.00 | 274165.33 | 26.00 | 3.43 | 190.46 | true | 0.260377;0.260108;0.260151;0.260518;0.260443 | 952320;952320;952320;952320;952320 | 5280;3232;3232;3232;3232 | 244000;309248;350560;269248;200000 | |
51 | densenet1_stage2_relu0_fwd | Activation | [1,192,28,28] | 41.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 301056 | 192.00 | 64533.33 | 57.70 | 4.65 | 60.21 | true | 0.576490;0.577820;0.570403;0.575651;0.578869 | 301056;301056;301056;301056;301056 | 53120;81728;97312;58752;50432 | 128;128;224;224;224 | |
52 | densenet1_stage2_conv0_fwd | Convolution | [1,192,28,28] | 3191 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 23.00 | 59132928 | 147498.67 | 585717.33 | 5.50 | 80.65 | 2571.00 | false | 0.055402;0.055667;0.054534;0.055219;0.055955 | 59132928;59132928;59132928;59132928;59132928 | 147456;147456;147520;147520;147520 | 558144;581408;549376;625024;617600 | |
53 | densenet1_stage2_batchnorm1_fwd | BatchNorm | [1,192,28,28] | 86.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 135530.67 | 25.60 | 6.87 | 190.46 | true | 0.255216;0.255653;0.255356;0.255812;0.256249 | 952320;952320;952320;952320;952320 | 3072;3072;3072;3072;5120 | 150464;135616;141504;129472;125888 | |
54 | densenet1_stage2_relu1_fwd | Activation | [1,192,28,28] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 301056 | 224.00 | 448.00 | 57.70 | 448.00 | 64.51 | false | 0.575735;0.577384;0.577696;0.580925;0.575267 | 301056;301056;301056;301056;301056 | 224;1760;224;224;224 | 384;512;448;448;448 | |
55 | densenet1_stage2_conv1_fwd | Convolution | [1,192,28,28] | 9430.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 105283584 | 0.00 | 544170.67 | 12.50 | 193.48 | 2105.67 | false | 0.124902;0.124903;0.124899;0.124899;0.124901 | 105283584;105283584;105283584;105283584;105283584 | 545312;543712;543904;544288;544320 | 0;0;0;0;0 | |
55 | densenet1_stage2_conv1_fwd | Convolution | [1,192,28,28] | 9430.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 331776.00 | 393706.67 | 7.50 | 0.98 | 152.71 | true | 0.075242;0.075160;0.075164;0.075128;0.075361 | 712704;712704;712704;712704;712704 | 393536;393824;393760;393920;393344 | 331776;331776;331776;331776;331776 | |
56 | densenet1_stage2_concat0 | Concat | [1,192,28,28] | 62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.29 | 0 | 0.00 | 36768.00 | 35.30 | 0.00 | 0.00 | true | 0.499527;0.207076;0.500581;0.206868;0.499541;0.207158;0.498975;0.207087;0.500073;0.207001 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 256;73984;256;73664;384;75648;256;76160;384;71936 | |
56 | densenet1_stage2_concat0 | Concat | [1,192,28,28] | 62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.14 | 0 | 0.00 | 36768.00 | 35.30 | 0.00 | 0.00 | true | 0.499527;0.207076;0.500581;0.206868;0.499541;0.207158;0.498975;0.207087;0.500073;0.207001 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 256;73984;256;73664;384;75648;256;76160;384;71936 | |
57 | densenet1_stage2_batchnorm2_fwd | BatchNorm | [1,240,28,28] | 63.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 1190400 | 3840.00 | 213770.67 | 30.80 | 5.47 | 238.08 | true | 0.307504;0.308036;0.307225;0.307837;0.308107 | 1190400;1190400;1190400;1190400;1190400 | 3840;3840;3840;3840;3840 | 212736;218336;215520;209632;213056 | |
58 | densenet1_stage2_relu2_fwd | Activation | [1,240,28,28] | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 376320 | 0.00 | 52501.33 | 50.90 | 7.17 | 75.26 | true | 0.508898;0.506406;0.509290;0.508558;0.508529 | 376320;376320;376320;376320;376320 | 0;0;0;0;0 | 52992;51360;52224;53472;52288 | |
59 | densenet1_stage2_conv2_fwd | Convolution | [1,240,28,28] | 4099.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 27.00 | 73878528 | 184320.00 | 296074.67 | 5.60 | 153.79 | 2736.24 | false | 0.055045;0.055402;0.055413;0.056296;0.055708 | 73878528;73878528;73878528;73878528;73878528 | 184320;184320;184320;184320;184320 | 296960;296576;294688;294112;298144 | |
60 | densenet1_stage2_batchnorm3_fwd | BatchNorm | [1,192,28,28] | 87.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 426.67 | 25.60 | 272.20 | 190.46 | false | 0.255349;0.255866;0.255645;0.256163;0.256802 | 952320;952320;952320;952320;952320 | 3072;3072;3072;3072;3072 | 512;640;256;256;512 | |
61 | densenet1_stage2_relu3_fwd | Activation | [1,192,28,28] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 301056 | 0.00 | 384.00 | 53.80 | 784.00 | 69.48 | false | 0.536513;0.542496;0.533731;0.538311;0.539564 | 301056;301056;301056;301056;301056 | 0;0;0;0;0 | 384;384;512;384;384 | |
62 | densenet1_stage2_conv3_fwd | Convolution | [1,192,28,28] | 9413.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 105283584 | 0.00 | 286816.00 | 12.50 | 367.08 | 2119.79 | false | 0.124901;0.124904;0.124901;0.124906;0.124900 | 105283584;105283584;105283584;105283584;105283584 | 293984;281728;293184;278432;285536 | 0;0;0;0;0 | |
62 | densenet1_stage2_conv3_fwd | Convolution | [1,192,28,28] | 9413.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 201632.00 | 7.50 | 1.34 | 142.54 | true | 0.075182;0.075205;0.075394;0.075382;0.075236 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 197728;205856;197600;210144;201312 | |
63 | densenet1_stage2_concat1 | Concat | [1,240,28,28] | 72.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.29 | 0 | 0.00 | 94378.67 | 38.20 | 0.00 | 0.00 | true | 0.557285;0.207123;0.557462;0.207968;0.557715;0.207054;0.558978;0.206862;0.556213;0.207084 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 81440;111648;79200;106912;78720;110528;78400;111520;79232;108960 | |
63 | densenet1_stage2_concat1 | Concat | [1,240,28,28] | 72.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.14 | 0 | 0.00 | 94378.67 | 38.20 | 0.00 | 0.00 | true | 0.557285;0.207123;0.557462;0.207968;0.557715;0.207054;0.558978;0.206862;0.556213;0.207084 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 81440;111648;79200;106912;78720;110528;78400;111520;79232;108960 | |
64 | densenet1_stage2_batchnorm4_fwd | BatchNorm | [1,288,28,28] | 66.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 1428480 | 4778.67 | 490272.00 | 35.90 | 2.89 | 285.70 | true | 0.359983;0.358451;0.363853;0.359234;0.357934 | 1428480;1428480;1428480;1428480;1428480 | 4608;5120;4608;9728;4608 | 495616;478304;492128;486112;492576 | |
65 | densenet1_stage2_relu4_fwd | Activation | [1,288,28,28] | 67.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 451584 | 0.00 | 50133.33 | 53.70 | 9.01 | 104.22 | true | 0.541354;0.536054;0.535999;0.535773;0.538454 | 451584;451584;451584;451584;451584 | 0;0;0;0;0 | 53888;51968;47488;47872;50560 | |
66 | densenet1_stage2_conv4_fwd | Convolution | [1,288,28,28] | 4884 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 31.33 | 88624128 | 221184.00 | 366890.67 | 5.50 | 150.70 | 2828.46 | false | 0.055201;0.056095;0.055154;0.056439;0.054907 | 88624128;88624128;88624128;88624128;88624128 | 221184;221184;221184;223488;221184 | 375584;365472;361504;367264;367936 | |
67 | densenet1_stage2_batchnorm5_fwd | BatchNorm | [1,192,28,28] | 89.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 981.33 | 25.60 | 234.95 | 190.46 | false | 0.256231;0.256359;0.255439;0.259460;0.254878 | 952320;952320;952320;952320;952320 | 3072;3072;3072;3072;3072 | 768;896;1280;1024;1024 | |
68 | densenet1_stage2_relu5_fwd | Activation | [1,192,28,28] | 49 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 301056 | 0.00 | 426.67 | 53.70 | 705.60 | 64.51 | false | 0.536856;0.537811;0.539028;0.537607;0.536019 | 301056;301056;301056;301056;301056 | 0;0;6912;0;0 | 512;384;384;512;384 | |
69 | densenet1_stage2_conv5_fwd | Convolution | [1,192,28,28] | 9471.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 105283584 | 0.00 | 399925.33 | 12.50 | 263.26 | 2105.67 | false | 0.124904;0.124902;0.124905;0.124900;0.124904 | 105283584;105283584;105283584;105283584;105283584 | 0;0;0;0;0 | 390656;399744;398400;401632;403584 | |
69 | densenet1_stage2_conv5_fwd | Convolution | [1,192,28,28] | 9471.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 248405.33 | 7.50 | 1.23 | 142.54 | true | 0.075295;0.075304;0.075251;0.075195;0.075189 | 712704;712704;712704;712704;712704 | 250272;245696;246912;248512;249792 | 331776;331776;332032;331776;331776 | |
70 | densenet1_stage2_concat2 | Concat | [1,288,28,28] | 72.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 0.00 | 47514.67 | 40.40 | 0.00 | 0.00 | true | 0.601044;0.206913;0.601047;0.206865;0.602019;0.207227;0.601551;0.207031;0.603035;0.206961 | 0;0;0;0;0;0;0;0;0;0 | 0;2304;0;0;0;0;0;0;0;0 | 8576;86880;8192;86816;8704;85888;8576;86528;8448;87840 | |
70 | densenet1_stage2_concat2 | Concat | [1,288,28,28] | 72.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.57 | 0 | 0.00 | 47514.67 | 40.40 | 0.00 | 0.00 | true | 0.601044;0.206913;0.601047;0.206865;0.602019;0.207227;0.601551;0.207031;0.603035;0.206961 | 0;0;0;0;0;0;0;0;0;0 | 0;2304;0;0;0;0;0;0;0;0 | 8576;86880;8192;86816;8704;85888;8576;86528;8448;87840 | |
71 | densenet1_stage2_batchnorm6_fwd | BatchNorm | [1,336,28,28] | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 1666560 | 5376.00 | 724277.33 | 41.00 | 2.28 | 294.08 | true | 0.409533;0.407940;0.409512;0.410360;0.415560 | 1666560;1666560;1666560;1666560;1666560 | 5376;5376;5376;5376;12032 | 716800;724384;710816;731648;738304 | |
72 | densenet1_stage2_relu6_fwd | Activation | [1,336,28,28] | 59.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 526848 | 0.00 | 183573.33 | 56.80 | 2.87 | 105.37 | true | 0.567840;0.567776;0.569409;0.569704;0.567246 | 526848;526848;526848;526848;526848 | 0;0;0;0;0 | 185760;185376;183328;182016;176544 | |
73 | densenet1_stage2_conv6_fwd | Convolution | [1,336,28,28] | 5492.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 35.00 | 103369728 | 258048.00 | 518634.67 | 5.50 | 133.09 | 2953.42 | false | 0.055144;0.054934;0.054607;0.056105;0.056431 | 103369728;103369728;103369728;103369728;103369728 | 258048;258048;258048;258048;258048 | 517632;506624;513888;530848;524384 | |
74 | densenet1_stage2_batchnorm7_fwd | BatchNorm | [1,192,28,28] | 89 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 4778.67 | 384.00 | 25.60 | 184.46 | 190.46 | false | 0.256231;0.255809;0.257214;0.255684;0.256004 | 952320;952320;952320;952320;952320 | 8192;13312;3072;3072;3072 | 384;512;256;384;384 | |
75 | densenet1_stage2_relu7_fwd | Activation | [1,192,28,28] | 36.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 301056 | 0.00 | 768.00 | 54.00 | 392.00 | 64.51 | false | 0.538849;0.542265;0.538896;0.537988;0.540881 | 301056;301056;301056;301056;301056 | 0;0;0;0;0 | 384;1408;1408;384;512 | |
76 | densenet1_stage2_conv7_fwd | Convolution | [1,192,28,28] | 9305 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 105283584 | 0.00 | 484800.00 | 12.50 | 217.17 | 2119.79 | false | 0.124900;0.124907;0.124901;0.124902;0.124905 | 105283584;105283584;105283584;105283584;105283584 | 0;0;0;0;0 | 478976;486912;481088;490432;486400 | |
76 | densenet1_stage2_conv7_fwd | Convolution | [1,192,28,28] | 9305 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 321461.33 | 7.50 | 1.09 | 142.54 | true | 0.075374;0.075219;0.075133;0.075265;0.075519 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 327232;318432;326400;317664;319552 | |
77 | densenet1_stage2_concat3 | Concat | [1,336,28,28] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 0.00 | 220853.33 | 41.90 | 0.00 | 0.00 | true | 0.631090;0.207130;0.631144;0.207120;0.629170;0.207782;0.631216;0.206912;0.633022;0.207166 | 0;0;0;0;0;0;0;0;0;0 | 233152;205824;236608;208832;233504;207200;239072;202656;242848;204480 | 0;0;0;0;0;0;0;0;0;0 | |
77 | densenet1_stage2_concat3 | Concat | [1,336,28,28] | 79.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 0.00 | 220853.33 | 41.90 | 0.00 | 0.00 | true | 0.631090;0.207130;0.631144;0.207120;0.629170;0.207782;0.631216;0.206912;0.633022;0.207166 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 233152;205824;236608;208832;233504;207200;239072;202656;242848;204480 | |
78 | densenet1_stage2_batchnorm8_fwd | BatchNorm | [1,384,28,28] | 70 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.33 | 1904640 | 6144.00 | 979168.00 | 45.30 | 1.93 | 357.14 | true | 0.446626;0.447350;0.456524;0.456359;0.455211 | 1904640;1904640;1904640;1904640;1904640 | 6144;6144;6144;7424;6144 | 977504;980544;980288;976576;979712 | |
79 | densenet1_stage2_relu8_fwd | Activation | [1,384,28,28] | 66.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 602112 | 0.00 | 78538.67 | 59.70 | 7.67 | 120.42 | true | 0.598875;0.597866;0.597405;0.596664;0.596296 | 602112;602112;602112;602112;602112 | 0;0;0;0;0 | 86528;76416;78560;76800;80256 | |
80 | densenet1_stage2_conv8_fwd | Convolution | [1,384,28,28] | 6287.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.00 | 118115328 | 294912.00 | 519541.33 | 5.60 | 145.02 | 3028.60 | false | 0.056009;0.055050;0.056181;0.054560;0.056696 | 118115328;118115328;118115328;118115328;118115328 | 294912;294912;294912;294912;294912 | 514912;516384;513792;528864;527328 | |
81 | densenet1_stage2_batchnorm9_fwd | BatchNorm | [1,192,28,28] | 86.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 952320 | 3072.00 | 426.67 | 25.60 | 272.20 | 219.78 | false | 0.258349;0.256186;0.255254;0.256589;0.256029 | 952320;952320;952320;952320;952320 | 3072;3072;3072;3072;3072 | 256;1152;256;256;768 | |
82 | densenet1_stage2_relu9_fwd | Activation | [1,192,28,28] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 301056 | 597.33 | 426.67 | 55.20 | 294.00 | 60.21 | false | 0.576263;0.539391;0.577139;0.540607;0.535799 | 301056;301056;301056;301056;301056 | 384;512;1408;384;384 | 0;0;5120;0;1792 | |
83 | densenet1_stage2_conv9_fwd | Convolution | [1,192,28,28] | 9297 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 105283584 | 0.00 | 464416.00 | 12.50 | 226.70 | 2105.67 | false | 0.124903;0.124899;0.124902;0.124907;0.124900 | 105283584;105283584;105283584;105283584;105283584 | 6656;0;0;0;0 | 463904;463872;473536;465472;456960 | |
83 | densenet1_stage2_conv9_fwd | Convolution | [1,192,28,28] | 9297 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 296224.00 | 7.50 | 1.13 | 142.54 | true | 0.075241;0.075191;0.075084;0.075242;0.075114 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;336896 | 290496;297088;294880;296704;302400 | |
84 | densenet1_stage2_concat4 | Concat | [1,384,28,28] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 189621.33 | 43.10 | 0.00 | 0.00 | true | 0.654436;0.207017;0.655422;0.206814;0.653869;0.207183;0.655365;0.207704;0.656882;0.207001 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;512 | 234464;139936;245536;140384;241792;136832;241216;139936;241952;138016 | |
84 | densenet1_stage2_concat4 | Concat | [1,384,28,28] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.00 | 0 | 0.00 | 189621.33 | 43.10 | 0.00 | 0.00 | true | 0.654436;0.207017;0.655422;0.206814;0.653869;0.207183;0.655365;0.207704;0.656882;0.207001 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;512 | 234464;139936;245536;140384;241792;136832;241216;139936;241952;138016 | |
85 | densenet1_stage2_batchnorm10_fwd | BatchNorm | [1,432,28,28] | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 2142720 | 6912.00 | 1251733.33 | 50.30 | 1.70 | 378.10 | true | 0.505677;0.498760;0.506173;0.500750;0.503865 | 2142720;2142720;2142720;2142720;2142720 | 6912;6912;6912;6912;6912 | 1243520;1258560;1249344;1255008;1250848 | |
86 | densenet1_stage2_relu10_fwd | Activation | [1,432,28,28] | 74 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 677376 | 0.00 | 165568.00 | 59.00 | 4.09 | 135.48 | true | 0.586247;0.586353;0.593742;0.591784;0.590566 | 677376;677376;677376;677376;677376 | 0;0;0;0;0 | 169280;165952;164288;162432;166464 | |
87 | densenet1_stage2_conv10_fwd | Convolution | [1,432,28,28] | 7063 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 44.00 | 132860928 | 331776.00 | 605493.33 | 5.50 | 141.75 | 3019.57 | false | 0.054715;0.056389;0.054886;0.056218;0.053741 | 132860928;132860928;132860928;132860928;132860928 | 605696;606208;604416;604928;605856 | 331776;331776;331776;331776;331776 | |
88 | densenet1_stage2_batchnorm11_fwd | BatchNorm | [1,192,28,28] | 90.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 512.00 | 25.60 | 265.71 | 190.46 | false | 0.255122;0.255761;0.256307;0.256211;0.255396 | 952320;952320;952320;952320;952320 | 3072;3072;9984;3072;3072 | 640;512;512;512;384 | |
89 | densenet1_stage2_relu11_fwd | Activation | [1,192,28,28] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 301056 | 0.00 | 384.00 | 55.30 | 784.00 | 60.21 | false | 0.597402;0.540278;0.575383;0.542439;0.540666 | 301056;301056;301056;301056;301056 | 384;384;896;384;384 | 0;0;256;0;0 | |
90 | densenet1_stage2_conv11_fwd | Convolution | [1,192,28,28] | 9284.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 105283584 | 0.00 | 542688.00 | 12.50 | 194.00 | 2119.79 | false | 0.124899;0.124903;0.124903;0.124903;0.124902 | 105283584;105283584;105283584;105283584;105283584 | 0;0;0;0;0 | 532192;543200;542624;542752;542688 | |
90 | densenet1_stage2_conv11_fwd | Convolution | [1,192,28,28] | 9284.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 393760.00 | 7.50 | 0.98 | 142.54 | true | 0.075085;0.075253;0.075250;0.075123;0.075213 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 393600;393504;393088;394176;394240 | |
91 | densenet1_stage2_concat5 | Concat | [1,432,28,28] | 93.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 341.33 | 376725.33 | 43.90 | 0.00 | 0.00 | true | 0.672894;0.207216;0.674761;0.207149;0.671152;0.207008;0.674026;0.206981;0.671297;0.207136 | 0;0;0;0;0;0;0;0;0;0 | 2560;0;1792;0;128;0;128;0;1792;0 | 565376;186944;564864;189824;563232;188608;564000;188608;564992;189824 | |
91 | densenet1_stage2_concat5 | Concat | [1,432,28,28] | 93.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 341.33 | 376725.33 | 43.90 | 0.00 | 0.00 | true | 0.672894;0.207216;0.674761;0.207149;0.671152;0.207008;0.674026;0.206981;0.671297;0.207136 | 0;0;0;0;0;0;0;0;0;0 | 565376;186944;564864;189824;563232;188608;564000;188608;564992;189824 | 2560;0;1792;0;128;0;128;0;1792;0 | |
92 | densenet1_stage2_batchnorm12_fwd | BatchNorm | [1,480,28,28] | 86 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 2380800 | 7680.00 | 1544330.67 | 53.70 | 1.53 | 420.12 | true | 0.538249;0.539848;0.536664;0.530045;0.535274 | 2380800;2380800;2380800;2380800;2380800 | 7680;7680;7680;7680;13056 | 1548032;1542912;1552096;1542048;1533920 | |
93 | densenet1_stage2_relu12_fwd | Activation | [1,480,28,28] | 105.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 752640 | 0.00 | 111925.33 | 62.10 | 6.72 | 150.53 | true | 0.627783;0.624224;0.622323;0.614159;0.617131 | 752640;752640;752640;752640;752640 | 0;0;0;0;0 | 112160;112032;111040;112128;111616 | |
94 | densenet1_stage2_conv12_fwd | Convolution | [1,480,28,28] | 7803.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 48.33 | 147606528 | 368640.00 | 605856.00 | 5.60 | 151.47 | 3053.95 | false | 0.055765;0.055686;0.055795;0.056257;0.055189 | 147606528;147606528;147606528;147606528;147606528 | 368640;368640;368640;368640;369152 | 606368;605472;606240;605216;605856 | |
95 | densenet1_stage2_batchnorm13_fwd | BatchNorm | [1,192,28,28] | 92.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 426.67 | 25.60 | 272.20 | 190.46 | false | 0.256151;0.256028;0.255670;0.255867;0.255803 | 952320;952320;952320;952320;952320 | 3072;3072;3072;3072;3072 | 256;512;512;256;512 | |
96 | densenet1_stage2_relu13_fwd | Activation | [1,192,28,28] | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 301056 | 224.00 | 512.00 | 55.20 | 409.04 | 60.21 | false | 0.574122;0.538376;0.576817;0.542885;0.539384 | 301056;301056;301056;301056;301056 | 0;224;224;224;224 | 512;512;512;640;512 | |
97 | densenet1_stage2_conv13_fwd | Convolution | [1,192,28,28] | 9332.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 105283584 | 0.00 | 540544.00 | 12.50 | 194.77 | 2119.79 | false | 0.124904;0.124902;0.124901;0.124903;0.124901 | 105283584;105283584;105283584;105283584;105283584 | 0;0;0;0;0 | 543104;542976;532352;543104;535552 | |
97 | densenet1_stage2_conv13_fwd | Convolution | [1,192,28,28] | 9332.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 396266.67 | 7.50 | 0.98 | 142.54 | true | 0.075194;0.075281;0.075046;0.075208;0.075185 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;333824 | 393696;393984;402912;393696;401120 | |
98 | densenet1_stage2_concat6 | Concat | [1,480,28,28] | 104.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 10.67 | 270506.67 | 44.90 | 0.00 | 0.00 | true | 0.691663;0.207000;0.694707;0.206760;0.691120;0.207161;0.691192;0.207169;0.692189;0.206918 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;64;6656;5440;0;0 | 365408;180544;352000;177152;361600;184256;372256;185920;357760;181504 | |
98 | densenet1_stage2_concat6 | Concat | [1,480,28,28] | 104.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 10.67 | 270506.67 | 44.90 | 0.00 | 0.00 | true | 0.691663;0.207000;0.694707;0.206760;0.691120;0.207161;0.691192;0.207169;0.692189;0.206918 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;64;6656;5440;0;0 | 365408;180544;352000;177152;361600;184256;372256;185920;357760;181504 | |
99 | densenet1_stage2_batchnorm14_fwd | BatchNorm | [1,528,28,28] | 110 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 2618880 | 8490.67 | 1539584.00 | 57.30 | 1.69 | 436.48 | true | 0.576452;0.572600;0.569455;0.573106;0.574776 | 2618880;2618880;2618880;2618880;2618880 | 1541568;1545920;1537696;1526688;1539488 | 8512;8512;8448;8448;8512 | |
100 | densenet1_stage2_relu14_fwd | Activation | [1,528,28,28] | 125.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 827904 | 85.33 | 189301.33 | 67.10 | 4.37 | 165.58 | true | 0.669714;0.671508;0.675226;0.642509;0.671432 | 827904;827904;827904;827904;827904 | 224;128;0;0;128 | 186784;185664;189216;191904;194624 | |
101 | densenet1_stage2_conv14_fwd | Convolution | [1,528,28,28] | 8834.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 52.67 | 162352128 | 405504.00 | 605429.33 | 5.60 | 160.60 | 3082.62 | false | 0.055573;0.055307;0.056683;0.056260;0.056000 | 162352128;162352128;162352128;162352128;162352128 | 405504;405504;405504;405504;405504 | 605632;604416;604640;606016;606208 | |
102 | densenet1_stage2_batchnorm15_fwd | BatchNorm | [1,192,28,28] | 94.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 1568.00 | 25.60 | 205.24 | 190.46 | false | 0.254965;0.255278;0.256421;0.255524;0.255859 | 952320;952320;952320;952320;952320 | 3072;3072;3072;3072;3072 | 2848;2080;1312;1312;672 | |
103 | densenet1_stage2_relu15_fwd | Activation | [1,192,28,28] | 37.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 301056 | 224.00 | 725.33 | 57.80 | 317.12 | 75.26 | false | 0.578135;0.576403;0.574879;0.589571;0.580829 | 301056;301056;301056;301056;301056 | 224;224;224;224;224 | 512;1152;512;512;1280 | |
104 | densenet1_stage2_conv15_fwd | Convolution | [1,192,28,28] | 9298.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.33 | 105283584 | 0.00 | 542869.33 | 12.50 | 193.94 | 2134.14 | false | 0.124901;0.124900;0.124902;0.124901;0.124902 | 105283584;105283584;105283584;105283584;105283584 | 0;0;0;0;0 | 529920;542848;543168;543296;542592 | |
104 | densenet1_stage2_conv15_fwd | Convolution | [1,192,28,28] | 9298.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 393504.00 | 7.50 | 0.98 | 142.54 | true | 0.074956;0.075343;0.074997;0.075212;0.075347 | 712704;712704;712704;712704;712704 | 332032;331776;331776;331776;331776 | 405440;393664;393152;393056;393696 | |
105 | densenet1_stage2_concat7 | Concat | [1,528,28,28] | 106.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.14 | 0 | 150122.67 | 527717.33 | 44.60 | 0.00 | 0.00 | true | 0.684600;0.207107;0.687871;0.207579;0.683784;0.207023;0.686977;0.207414;0.684430;0.206987 | 0;0;0;0;0;0;0;0;0;0 | 868224;186528;871680;183968;864576;190880;862528;191776;866048;190496 | 313984;0;294016;0;311424;0;310912;0;295808;0 | |
105 | densenet1_stage2_concat7 | Concat | [1,528,28,28] | 106.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 150122.67 | 527717.33 | 44.60 | 0.00 | 0.00 | true | 0.684600;0.207107;0.687871;0.207579;0.683784;0.207023;0.686977;0.207414;0.684430;0.206987 | 0;0;0;0;0;0;0;0;0;0 | 313984;0;294016;0;311424;0;310912;0;295808;0 | 868224;186528;871680;183968;864576;190880;862528;191776;866048;190496 | |
106 | densenet1_stage2_batchnorm16_fwd | BatchNorm | [1,576,28,28] | 97 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 2856960 | 9248.00 | 1882026.67 | 61.80 | 1.51 | 476.16 | true | 0.622946;0.620052;0.620708;0.612792;0.606123 | 2856960;2856960;2856960;2856960;2856960 | 9312;11264;9216;9216;9216 | 1884416;1882368;1882688;1881024;1880448 | |
107 | densenet1_stage2_relu16_fwd | Activation | [1,576,28,28] | 125 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 903168 | 42.67 | 137610.67 | 67.90 | 6.56 | 159.37 | true | 0.692904;0.669577;0.690061;0.668284;0.678838 | 903168;903168;903168;903168;903168 | 136224;135616;146176;135328;140992 | 0;128;0;0;128 | |
108 | densenet1_stage2_conv16_fwd | Convolution | [1,576,28,28] | 9593 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 56.33 | 177097728 | 442368.00 | 605525.33 | 5.70 | 169.00 | 3143.77 | false | 0.055744;0.056287;0.056617;0.056627;0.056944 | 177097728;177097728;177097728;177097728;177097728 | 442368;442368;449280;442368;442368 | 606048;605888;604192;604832;605856 | |
109 | densenet1_stage2_batchnorm17_fwd | BatchNorm | [1,192,28,28] | 91 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 2197.33 | 25.60 | 180.73 | 190.46 | false | 0.256421;0.256569;0.256040;0.255443;0.256286 | 952320;952320;952320;952320;952320 | 3072;3072;4096;3072;3072 | 1536;2560;2336;1696;2816 | |
110 | densenet1_stage2_relu17_fwd | Activation | [1,192,28,28] | 47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 301056 | 224.00 | 832.00 | 57.40 | 285.09 | 60.21 | false | 0.573880;0.571387;0.577381;0.575986;0.571886 | 301056;301056;301056;301056;301056 | 128;224;224;224;224 | 1184;928;640;768;800 | |
111 | densenet1_stage2_conv17_fwd | Convolution | [1,192,28,28] | 9273.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 105283584 | 0.00 | 542954.67 | 12.50 | 193.91 | 2105.67 | false | 0.124904;0.124902;0.124901;0.124896;0.124900 | 105283584;105283584;105283584;105283584;105283584 | 0;0;0;0;0 | 542656;542464;543104;543104;543872 | |
111 | densenet1_stage2_conv17_fwd | Convolution | [1,192,28,28] | 9273.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 394250.67 | 7.50 | 0.98 | 142.54 | true | 0.075269;0.075256;0.075024;0.075034;0.075127 | 712704;712704;712704;712704;712704 | 393920;394848;394368;394464;393696 | 331776;331776;331776;331776;331776 | |
112 | densenet1_stage2_concat8 | Concat | [1,576,28,28] | 99.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.14 | 0 | 140896.00 | 446778.67 | 46.20 | 0.00 | 0.00 | true | 0.714513;0.206866;0.719340;0.206908;0.719543;0.207191;0.718078;0.207123;0.722347;0.206822 | 0;0;0;0;0;0;0;0;0;0 | 679232;212896;680608;213408;685920;213152;680352;213920;682688;206496 | 294336;0;280768;0;298816;0;279360;0;284992;256 | |
112 | densenet1_stage2_concat8 | Concat | [1,576,28,28] | 99.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 140896.00 | 446778.67 | 46.20 | 0.00 | 0.00 | true | 0.714513;0.206866;0.719340;0.206908;0.719543;0.207191;0.718078;0.207123;0.722347;0.206822 | 0;0;0;0;0;0;0;0;0;0 | 679232;212896;680608;213408;685920;213152;680352;213920;682688;206496 | 294336;0;280768;0;298816;0;279360;0;284992;256 | |
113 | densenet1_stage2_batchnorm18_fwd | BatchNorm | [1,624,28,28] | 118.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 3095040 | 10048.00 | 1866016.00 | 64.80 | 1.65 | 488.72 | true | 0.654511;0.643415;0.660203;0.643674;0.646612 | 3095040;3095040;3095040;3095040;3095040 | 1869504;1860608;1868768;1868672;1856864 | 10880;10080;9984;9984;10080 | |
114 | densenet1_stage2_relu18_fwd | Activation | [1,624,28,28] | 103 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 978432 | 117.33 | 195221.33 | 66.10 | 5.01 | 172.65 | true | 0.660182;0.661513;0.660052;0.662749;0.660505 | 978432;978432;978432;978432;978432 | 224;6784;0;0;128 | 196480;200224;194048;190240;195136 | |
115 | densenet1_stage2_conv18_fwd | Convolution | [1,624,28,28] | 10339.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 60.33 | 191843328 | 479296.00 | 552885.33 | 5.60 | 185.86 | 3179.74 | false | 0.055001;0.055783;0.056961;0.057370;0.056300 | 191843328;191843328;191843328;191843328;191843328 | 563296;548064;553568;557024;546880 | 479296;479232;479296;479296;481088 | |
116 | densenet1_stage2_batchnorm19_fwd | BatchNorm | [1,192,28,28] | 85.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 189728.00 | 25.60 | 4.94 | 190.46 | true | 0.256363;0.255722;0.255702;0.257635;0.255075 | 952320;952320;952320;952320;952320 | 179296;193440;186656;189088;202240 | 5120;3072;3072;3072;3072 | |
117 | densenet1_stage2_relu19_fwd | Activation | [1,192,28,28] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 301056 | 224.00 | 39936.00 | 58.70 | 7.50 | 64.51 | true | 0.607857;0.577421;0.581863;0.574375;0.603209 | 301056;301056;301056;301056;301056 | 224;224;224;224;2272 | 39872;40960;38976;37952;42304 | |
118 | densenet1_stage2_conv19_fwd | Convolution | [1,192,28,28] | 9273 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.00 | 105283584 | 0.00 | 542976.00 | 12.50 | 193.90 | 2148.64 | false | 0.124899;0.124902;0.124899;0.124902;0.124901 | 105283584;105283584;105283584;105283584;105283584 | 0;0;0;0;0 | 542976;542656;543104;542848;543168 | |
118 | densenet1_stage2_conv19_fwd | Convolution | [1,192,28,28] | 9273 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 415850.67 | 7.50 | 0.95 | 142.54 | true | 0.075234;0.075133;0.075323;0.075222;0.074925 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 416800;416512;416672;414112;414368 | |
119 | densenet1_stage2_concat9 | Concat | [1,624,28,28] | 106 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.86 | 0 | 515136.00 | 692949.33 | 46.30 | 0.00 | 0.00 | true | 0.721554;0.207211;0.719233;0.207123;0.718960;0.207058;0.718597;0.206891;0.717938;0.207168 | 0;0;0;0;0;0;0;0;0;0 | 1169952;216032;1168768;217120;1169088;216736;1172544;209312;1170752;215584 | 1031808;0;1037824;0;1044992;0;1039488;0;1021184;0 | |
119 | densenet1_stage2_concat9 | Concat | [1,624,28,28] | 106 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.14 | 0 | 515136.00 | 692949.33 | 46.30 | 0.00 | 0.00 | true | 0.721554;0.207211;0.719233;0.207123;0.718960;0.207058;0.718597;0.206891;0.717938;0.207168 | 0;0;0;0;0;0;0;0;0;0 | 1169952;216032;1168768;217120;1169088;216736;1172544;209312;1170752;215584 | 1031808;0;1037824;0;1044992;0;1039488;0;1021184;0 | |
120 | densenet1_stage2_batchnorm20_fwd | BatchNorm | [1,672,28,28] | 98 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 3333120 | 10752.00 | 2155904.00 | 66.00 | 1.54 | 454.54 | true | 0.667863;0.649305;0.671312;0.664229;0.645182 | 3333120;3333120;3333120;3333120;3333120 | 2156736;2152672;2154944;2157952;2156032 | 10848;10752;10752;10752;10752 | |
121 | densenet1_stage2_relu20_fwd | Activation | [1,672,28,28] | 114.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1053696 | 42.67 | 204128.00 | 68.00 | 5.16 | 175.62 | true | 0.677833;0.682487;0.679474;0.685906;0.679357 | 1053696;1053696;1053696;1053696;1053696 | 0;128;0;0;128 | 202528;205600;204256;202240;208544 | |
122 | densenet1_stage2_conv20_fwd | Convolution | [1,672,28,28] | 11137.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.33 | 206588928 | 516096.00 | 594165.33 | 5.60 | 186.07 | 3211.24 | false | 0.057019;0.056015;0.056559;0.055442;0.056923 | 206588928;206588928;206588928;206588928;206588928 | 516096;516096;516096;516096;516096 | 591104;592096;594624;595776;596064 | |
123 | densenet1_stage2_batchnorm21_fwd | BatchNorm | [1,192,28,28] | 86.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 167445.33 | 25.60 | 5.58 | 190.46 | true | 0.256049;0.256567;0.256857;0.255494;0.255216 | 952320;952320;952320;952320;952320 | 3072;3072;3072;3072;3072 | 168960;167328;167776;164640;167232 | |
124 | densenet1_stage2_relu21_fwd | Activation | [1,192,28,28] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 301056 | 224.00 | 47765.33 | 57.40 | 6.27 | 69.48 | true | 0.573979;0.572465;0.577374;0.577033;0.571654 | 301056;301056;301056;301056;301056 | 224;224;224;224;224 | 53280;47008;42624;46592;49696 | |
125 | densenet1_stage2_conv21_fwd | Convolution | [1,192,28,28] | 9298.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 105283584 | 0.00 | 540682.67 | 12.50 | 194.72 | 2105.67 | false | 0.124902;0.124905;0.124904;0.124900;0.124902 | 105283584;105283584;105283584;105283584;105283584 | 0;0;0;0;0 | 543904;544352;532896;543136;535008 | |
125 | densenet1_stage2_conv21_fwd | Convolution | [1,192,28,28] | 9298.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 419509.33 | 7.50 | 0.95 | 142.54 | true | 0.075186;0.075227;0.075195;0.075244;0.075248 | 712704;712704;712704;712704;712704 | 413856;420576;417440;420512;426976 | 331776;331776;331776;331776;333824 | |
126 | densenet1_stage2_concat10 | Concat | [1,672,28,28] | 104.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.86 | 0 | 486688.00 | 676256.00 | 46.40 | 0.00 | 0.00 | true | 0.721726;0.207018;0.721228;0.206983;0.719343;0.207289;0.719324;0.207253;0.724050;0.206927 | 0;0;0;0;0;0;0;0;0;0 | 977984;0;973760;0;982720;0;968384;0;1003456;0 | 1208832;158464;1212608;158400;1171872;159264;1199328;159776;1212096;156992 | |
126 | densenet1_stage2_concat10 | Concat | [1,672,28,28] | 104.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.14 | 0 | 486688.00 | 676256.00 | 46.40 | 0.00 | 0.00 | true | 0.721726;0.207018;0.721228;0.206983;0.719343;0.207289;0.719324;0.207253;0.724050;0.206927 | 0;0;0;0;0;0;0;0;0;0 | 977984;0;973760;0;982720;0;968384;0;1003456;0 | 1208832;158464;1212608;158400;1171872;159264;1199328;159776;1212096;156992 | |
127 | densenet1_stage2_batchnorm22_fwd | BatchNorm | [1,720,28,28] | 124.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.67 | 3571200 | 11616.00 | 2299477.33 | 64.60 | 1.55 | 465.79 | true | 0.637266;0.652259;0.649225;0.631807;0.666197 | 3571200;3571200;3571200;3571200;3571200 | 11616;11616;11520;14848;11616 | 2297920;2301152;2299616;2298272;2300544 | |
128 | densenet1_stage2_relu22_fwd | Activation | [1,720,28,28] | 117 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1128960 | 213.33 | 258592.00 | 70.80 | 4.36 | 188.16 | true | 0.704325;0.710314;0.707080;0.713945;0.707584 | 1128960;1128960;1128960;1128960;1128960 | 384;128;6784;128;128 | 241920;252448;259296;264032;267360 | |
129 | densenet1_stage2_conv22_fwd | Convolution | [1,720,28,28] | 11942.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 68.67 | 221334528 | 553024.00 | 550016.00 | 5.60 | 200.66 | 3223.30 | false | 0.056626;0.055875;0.055982;0.055963;0.055385 | 221334528;221334528;221334528;221334528;221334528 | 552960;553024;553984;553024;553024 | 552480;540128;547552;550016;555936 | |
130 | densenet1_stage2_batchnorm23_fwd | BatchNorm | [1,192,28,28] | 88.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 952320 | 3072.00 | 277610.67 | 25.60 | 3.39 | 190.46 | true | 0.255417;0.257677;0.256322;0.255776;0.255239 | 952320;952320;952320;952320;952320 | 3072;3072;3072;3072;3072 | 273568;281024;278720;280544;270880 | |
131 | densenet1_stage2_relu23_fwd | Activation | [1,192,28,28] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 301056 | 224.00 | 67840.00 | 57.80 | 4.42 | 69.48 | true | 0.576492;0.580743;0.577199;0.586223;0.576678 | 301056;301056;301056;301056;301056 | 64128;68800;69504;65856;68864 | 224;224;224;224;224 | |
132 | densenet1_stage2_conv23_fwd | Convolution | [1,192,28,28] | 9329.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 105283584 | 0.00 | 538506.67 | 12.50 | 195.51 | 2105.67 | false | 0.124901;0.124897;0.124899;0.124901;0.124898 | 105283584;105283584;105283584;105283584;105283584 | 0;0;0;0;0 | 532576;543520;543520;539424;529568 | |
132 | densenet1_stage2_conv23_fwd | Convolution | [1,192,28,28] | 9329.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 332458.67 | 428576.00 | 7.50 | 0.94 | 142.54 | true | 0.075350;0.075192;0.075393;0.075204;0.075304 | 712704;712704;712704;712704;712704 | 331776;331776;331776;333824;338432 | 427392;428576;426336;429760;438080 | |
133 | densenet1_stage2_concat11 | Concat | [1,720,28,28] | 104.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 7.43 | 0 | 905125.33 | 842133.33 | 48.00 | 0.00 | 0.00 | true | 0.754465;0.210419;0.751589;0.207187;0.752424;0.210256;0.747825;0.210132;0.773487;0.207106 | 0;0;0;0;0;0;0;0;0;0 | 1466304;215584;1461184;219680;1458848;222784;1461376;219328;1462368;228928 | 1798528;160;1808128;160;1818496;0;1838976;0;1832576;5280 | |
133 | densenet1_stage2_concat11 | Concat | [1,720,28,28] | 104.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.57 | 0 | 905125.33 | 842133.33 | 48.00 | 0.00 | 0.00 | true | 0.754465;0.210419;0.751589;0.207187;0.752424;0.210256;0.747825;0.210132;0.773487;0.207106 | 0;0;0;0;0;0;0;0;0;0 | 1798528;160;1808128;160;1818496;0;1838976;0;1832576;5280 | 1466304;215584;1461184;219680;1458848;222784;1461376;219328;1462368;228928 | |
134 | densenet1_batchnorm2_fwd | BatchNorm | [1,768,28,28] | 102.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.67 | 3809280 | 12384.00 | 2425717.33 | 62.20 | 1.56 | 496.84 | true | 0.620762;0.625276;0.627021;0.610936;0.619099 | 3809280;3809280;3809280;3809280;3809280 | 12384;12384;12384;12384;12384 | 2428320;2428992;2425920;2422912;2415808 | |
135 | densenet1_relu2_fwd | Activation | [1,768,28,28] | 134 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 1204224 | 128.00 | 291733.33 | 72.80 | 4.13 | 200.70 | true | 0.730655;0.728180;0.726267;0.728386;0.724405 | 1204224;1204224;1204224;1204224;1204224 | 128;896;128;128;128 | 293312;296096;299168;285440;285792 | |
136 | densenet1_conv2_fwd | Convolution | [1,768,28,28] | 24966 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 77.33 | 472160256 | 1189248.00 | 1094496.00 | 10.80 | 206.75 | 6105.55 | false | 0.106172;0.109399;0.108048;0.107877;0.106907 | 472160256;472160256;472160256;472160256;472160256 | 1188992;1181696;1186880;1200064;1191872 | 1093440;1075968;1090144;1101248;1099904 | |
137 | densenet1_pool2_fwd | Pooling | [1,384,28,28] | 847 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 6.00 | 1580544 | 0.00 | 426709.33 | 24.80 | 3.70 | 263.42 | true | 0.247617;0.248100;0.248200;0.247538;0.247636 | 1580544;1580544;1580544;1580544;1580544 | 423264;436320;427072;429792;420192 | 0;0;0;0;0 | |
138 | densenet1_stage3_batchnorm0_fwd | BatchNorm | [1,384,14,14] | 82.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 476160 | 6144.00 | 262581.33 | 12.30 | 1.77 | 95.23 | true | 0.123257;0.123443;0.123689;0.124011;0.120350 | 476160;476160;476160;476160;476160 | 6144;6144;6144;6144;6144 | 262112;262368;261664;263264;264000 | |
139 | densenet1_stage3_relu0_fwd | Activation | [1,384,14,14] | 22.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 150528 | 0.00 | 39744.00 | 41.80 | 3.79 | 37.63 | true | 0.419285;0.418215;0.417958;0.417608;0.418347 | 150528;150528;150528;150528;150528 | 0;0;0;0;0 | 40320;39680;40832;39232;38048 | |
140 | densenet1_stage3_conv0_fwd | Convolution | [1,384,14,14] | 1671.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 28.33 | 33067776 | 294912.00 | 121856.00 | 3.10 | 79.34 | 1167.11 | false | 0.031246;0.031246;0.031246;0.031247;0.031247 | 33067776;33067776;33067776;33067776;33067776 | 121376;123232;119072;122816;121376 | 294912;294912;294912;294912;294912 | |
141 | densenet1_stage3_batchnorm1_fwd | BatchNorm | [1,192,14,14] | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 153077.33 | 6.80 | 1.52 | 51.01 | true | 0.068286;0.068065;0.068190;0.068150;0.068176 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 152992;152992;155616;151648;153248 | |
142 | densenet1_stage3_relu1_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 26656.00 | 33.90 | 2.80 | 18.82 | true | 0.337567;0.338716;0.338808;0.348422;0.338040 | 75264;75264;75264;75264;75264 | 28160;25760;26592;26080;27296 | 224;224;224;224;224 | |
143 | densenet1_stage3_conv1_fwd | Convolution | [1,192,14,14] | 2453.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 392522.67 | 12.50 | 67.06 | 526.42 | false | 0.124924;0.124924;0.124927;0.124929;0.124917 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 392608;392128;391616;392832;392960 | |
143 | densenet1_stage3_conv1_fwd | Convolution | [1,192,14,14] | 2453.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 393749.33 | 7.50 | 0.98 | 142.54 | true | 0.075295;0.075244;0.075243;0.075160;0.075206 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 393664;394016;395072;393568;393120 | |
144 | densenet1_stage3_concat0 | Concat | [1,384,14,14] | 49 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 0.00 | 76933.33 | 23.60 | 0.00 | 0.00 | true | 0.350424;0.121896;0.350617;0.121798;0.350428;0.121838;0.350535;0.121810;0.350484;0.121793 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;1280;0 | 166560;4352;161120;4224;157536;4992;128736;4864;161376;4224 | |
144 | densenet1_stage3_concat0 | Concat | [1,384,14,14] | 49 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 0.00 | 76933.33 | 23.60 | 0.00 | 0.00 | true | 0.350424;0.121896;0.350617;0.121798;0.350428;0.121838;0.350535;0.121810;0.350484;0.121793 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;1280;0 | 166560;4352;161120;4224;157536;4992;128736;4864;161376;4224 | |
145 | densenet1_stage3_batchnorm2_fwd | BatchNorm | [1,432,14,14] | 46 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 535680 | 6912.00 | 52042.67 | 13.40 | 9.09 | 107.14 | true | 0.134495;0.132229;0.135920;0.133772;0.134289 | 535680;535680;535680;535680;535680 | 13568;6912;6912;6912;6912 | 51008;54208;51488;51776;52864 | |
146 | densenet1_stage3_relu2_fwd | Activation | [1,432,14,14] | 23.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 169344 | 0.00 | 3413.33 | 44.00 | 49.61 | 36.29 | false | 0.440972;0.438749;0.440877;0.440234;0.440095 | 169344;169344;169344;169344;169344 | 3712;3456;3200;3328;3456 | 0;0;256;0;0 | |
147 | densenet1_stage3_conv2_fwd | Convolution | [1,432,14,14] | 1878 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 31.00 | 37196544 | 331776.00 | 320.00 | 3.10 | 112.01 | 1199.89 | false | 0.031247;0.031246;0.031246;0.031247;0.031246 | 37196544;37196544;37196544;37196544;37196544 | 331776;336896;331776;331776;331776 | 320;320;320;320;448 | |
148 | densenet1_stage3_batchnorm3_fwd | BatchNorm | [1,192,14,14] | 65.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 0.00 | 6.80 | 77.50 | 47.62 | false | 0.068741;0.068130;0.068146;0.068009;0.067934 | 238080;238080;238080;238080;238080 | 0;128;0;0;0 | 3072;3072;3072;3072;3072 | |
149 | densenet1_stage3_relu3_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 85.33 | 128.00 | 31.70 | 352.80 | 18.82 | false | 0.319289;0.317652;0.315697;0.316098;0.317038 | 75264;75264;75264;75264;75264 | 128;128;128;0;128 | 0;256;0;5376;0 | |
150 | densenet1_stage3_conv3_fwd | Convolution | [1,192,14,14] | 2443.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 0.00 | 12.50 | 0.00 | 526.42 | true | 0.124914;0.124914;0.124917;0.124914;0.124909 | 26320896;26320896;26320896;26320896;26320896 | 0;0;768;0;0 | 0;0;0;0;128 | |
150 | densenet1_stage3_conv3_fwd | Convolution | [1,192,14,14] | 2443.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331861.33 | 42.67 | 7.50 | 2.15 | 142.54 | true | 0.075274;0.075173;0.075504;0.075311;0.075179 | 712704;712704;712704;712704;712704 | 331776;331776;338432;331776;332032 | 128;0;0;128;0 | |
151 | densenet1_stage3_concat1 | Concat | [1,432,14,14] | 47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 0.00 | 16384.00 | 25.10 | 0.00 | 0.00 | true | 0.381577;0.121810;0.384192;0.121786;0.380479;0.121789;0.381485;0.121780;0.380935;0.121787 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 0;32640;128;32768;0;32896;0;32768;0;32768 | |
151 | densenet1_stage3_concat1 | Concat | [1,432,14,14] | 47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 0.00 | 16384.00 | 25.10 | 0.00 | 0.00 | true | 0.381577;0.121810;0.384192;0.121786;0.380479;0.121789;0.381485;0.121780;0.380935;0.121787 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 0;32640;128;32768;0;32896;0;32768;0;32768 | |
152 | densenet1_stage3_batchnorm4_fwd | BatchNorm | [1,480,14,14] | 45.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 595200 | 7680.00 | 72149.33 | 14.40 | 7.46 | 119.04 | true | 0.144543;0.143283;0.148153;0.142889;0.142106 | 595200;595200;595200;595200;595200 | 73696;73408;70656;70496;72384 | 7680;7680;7680;12288;7680 | |
153 | densenet1_stage3_relu4_fwd | Activation | [1,480,14,14] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 188160 | 0.00 | 9898.67 | 46.30 | 19.01 | 47.04 | false | 0.464923;0.461839;0.473403;0.461692;0.460225 | 188160;188160;188160;188160;188160 | 0;0;0;0;5632 | 9472;9344;10112;10240;10112 | |
154 | densenet1_stage3_conv4_fwd | Convolution | [1,480,14,14] | 2070.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 34.00 | 41325312 | 368640.00 | 426.67 | 3.10 | 111.97 | 1215.45 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 41325312;41325312;41325312;41325312;41325312 | 372224;368640;368640;368640;368640 | 512;448;320;576;320 | |
155 | densenet1_stage3_batchnorm5_fwd | BatchNorm | [1,192,14,14] | 62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 238080 | 3072.00 | 1674.67 | 6.80 | 50.16 | 54.95 | false | 0.068160;0.067328;0.068246;0.068287;0.068357 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 2176;2336;512;384;2592 | |
156 | densenet1_stage3_relu5_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 288.00 | 31.70 | 261.33 | 18.82 | false | 0.317317;0.315953;0.317467;0.317645;0.316594 | 75264;75264;75264;75264;75264 | 0;0;0;1536;0 | 448;288;64;192;384 | |
157 | densenet1_stage3_conv5_fwd | Convolution | [1,192,14,14] | 2434 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.33 | 26320896 | 0.00 | 1184.00 | 12.50 | 22230.49 | 533.54 | false | 0.124927;0.124929;0.124926;0.124930;0.124930 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 1440;1760;288;416;1696 | |
157 | densenet1_stage3_conv5_fwd | Convolution | [1,192,14,14] | 2434 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 1056.00 | 7.50 | 2.14 | 142.54 | true | 0.075301;0.075229;0.075098;0.075485;0.075281 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 1280;1024;1152;992;768 | |
158 | densenet1_stage3_concat2 | Concat | [1,480,14,14] | 46.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 0.00 | 9637.33 | 26.50 | 0.00 | 0.00 | true | 0.408620;0.121811;0.408406;0.121927;0.407943;0.121820;0.408664;0.121867;0.409049;0.121882 | 0;0;0;0;0;0;0;0;0;0 | 2144;17344;2400;17152;416;17088;416;16640;2400;17280 | 2048;0;256;0;0;0;0;0;0;0 | |
158 | densenet1_stage3_concat2 | Concat | [1,480,14,14] | 46.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 0.00 | 9637.33 | 26.50 | 0.00 | 0.00 | true | 0.408620;0.121811;0.408406;0.121927;0.407943;0.121820;0.408664;0.121867;0.409049;0.121882 | 0;0;0;0;0;0;0;0;0;0 | 2048;0;256;0;0;0;0;0;0;0 | 2144;17344;2400;17152;416;17088;416;16640;2400;17280 | |
159 | densenet1_stage3_batchnorm6_fwd | BatchNorm | [1,528,14,14] | 47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.33 | 654720 | 8448.00 | 35360.00 | 15.60 | 14.95 | 122.77 | true | 0.157470;0.155570;0.157110;0.154381;0.156263 | 654720;654720;654720;654720;654720 | 35488;35008;34432;35584;35680 | 8448;8448;8448;8448;8448 | |
160 | densenet1_stage3_relu6_fwd | Activation | [1,528,14,14] | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 206976 | 0.00 | 4736.00 | 48.30 | 43.70 | 51.74 | false | 0.482253;0.483614;0.484421;0.483765;0.482361 | 206976;206976;206976;206976;206976 | 0;0;0;0;0 | 4544;5120;5504;4224;4544 | |
161 | densenet1_stage3_conv6_fwd | Convolution | [1,528,14,14] | 2281 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 45454080 | 405504.00 | 7552.00 | 3.10 | 110.04 | 1228.49 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 45454080;45454080;45454080;45454080;45454080 | 405504;405504;405504;405504;405504 | 7968;7648;8544;7040;4832 | |
162 | densenet1_stage3_batchnorm7_fwd | BatchNorm | [1,192,14,14] | 58.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 170.67 | 6.80 | 73.42 | 47.62 | false | 0.068539;0.068192;0.067998;0.068240;0.068144 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 256;128;128;384;128 | |
163 | densenet1_stage3_relu7_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 85.33 | 31.70 | 882.00 | 18.82 | false | 0.317935;0.317222;0.316562;0.316855;0.316821 | 75264;75264;75264;75264;75264 | 128;128;0;0;256 | 0;0;0;0;0 | |
164 | densenet1_stage3_conv7_fwd | Convolution | [1,192,14,14] | 2457 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.33 | 26320896 | 0.00 | 170.67 | 12.50 | 154223.70 | 533.54 | false | 0.124913;0.124916;0.124910;0.124913;0.124912 | 26320896;26320896;26320896;26320896;26320896 | 0;128;384;0;384 | 0;0;0;0;0 | |
164 | densenet1_stage3_conv7_fwd | Convolution | [1,192,14,14] | 2457 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 1856.00 | 7.50 | 2.14 | 142.54 | true | 0.075182;0.075337;0.075305;0.075157;0.075212 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 1792;1728;1536;2048;2304 | |
165 | densenet1_stage3_concat3 | Concat | [1,528,14,14] | 50.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 0.00 | 15317.33 | 27.60 | 0.00 | 0.00 | true | 0.429320;0.121809;0.430677;0.121716;0.430896;0.121820;0.431500;0.121980;0.431333;0.121842 | 0;0;0;0;0;0;0;0;0;0 | 128;31680;0;30848;0;35904;128;30080;0;30720 | 0;6656;0;0;0;0;0;0;0;0 | |
165 | densenet1_stage3_concat3 | Concat | [1,528,14,14] | 50.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 0.00 | 15317.33 | 27.60 | 0.00 | 0.00 | true | 0.429320;0.121809;0.430677;0.121716;0.430896;0.121820;0.431500;0.121980;0.431333;0.121842 | 0;0;0;0;0;0;0;0;0;0 | 128;31680;0;30848;0;35904;128;30080;0;30720 | 0;6656;0;0;0;0;0;0;0;0 | |
166 | densenet1_stage3_batchnorm8_fwd | BatchNorm | [1,576,14,14] | 50 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.33 | 714240 | 9216.00 | 98229.33 | 16.60 | 6.65 | 133.93 | true | 0.165126;0.165350;0.165050;0.167697;0.166705 | 714240;714240;714240;714240;714240 | 9216;9216;9728;9216;9216 | 99872;97248;103040;96448;97568 | |
167 | densenet1_stage3_relu8_fwd | Activation | [1,576,14,14] | 28.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 225792 | 0.00 | 12970.67 | 49.70 | 17.41 | 52.11 | true | 0.499169;0.496326;0.499067;0.495695;0.496626 | 225792;225792;225792;225792;225792 | 0;0;0;0;0 | 13056;12800;13312;13056;12800 | |
168 | densenet1_stage3_conv8_fwd | Convolution | [1,576,14,14] | 2519.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 39.67 | 49582848 | 442368.00 | 109674.67 | 3.10 | 89.82 | 1249.98 | false | 0.031248;0.031247;0.031247;0.031248;0.031248 | 49582848;49582848;49582848;49582848;49582848 | 442368;442368;442368;442368;442368 | 107296;112288;97568;110912;110816 | |
169 | densenet1_stage3_batchnorm9_fwd | BatchNorm | [1,192,14,14] | 63 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 238080 | 3072.00 | 37642.67 | 6.80 | 5.85 | 54.95 | true | 0.067752;0.067773;0.066830;0.067672;0.068075 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3200 | 38304;38208;36640;36480;38080 | |
170 | densenet1_stage3_relu9_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 5600.00 | 31.60 | 13.44 | 18.82 | true | 0.316081;0.315242;0.316241;0.317415;0.316288 | 75264;75264;75264;75264;75264 | 5696;5792;5184;5440;5664 | 0;0;6656;0;0 | |
171 | densenet1_stage3_conv9_fwd | Convolution | [1,192,14,14] | 2441.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 34293.33 | 12.50 | 767.52 | 526.42 | false | 0.124919;0.124929;0.124911;0.124924;0.124927 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 32672;36256;33568;34208;35104 | |
171 | densenet1_stage3_conv9_fwd | Convolution | [1,192,14,14] | 2441.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 166528.00 | 7.50 | 1.43 | 142.54 | true | 0.075342;0.075442;0.075185;0.075256;0.075277 | 712704;712704;712704;712704;712704 | 331776;331776;332544;331776;331776 | 169728;165632;166912;163840;167040 | |
172 | densenet1_stage3_concat4 | Concat | [1,576,14,14] | 49.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 0.00 | 41152.00 | 28.60 | 0.00 | 0.00 | true | 0.451457;0.121853;0.450699;0.121832;0.451490;0.121873;0.450059;0.121990;0.452028;0.121780 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 76096;4992;79616;5376;76480;5120;80320;6144;77696;4992 | |
172 | densenet1_stage3_concat4 | Concat | [1,576,14,14] | 49.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 0.00 | 41152.00 | 28.60 | 0.00 | 0.00 | true | 0.451457;0.121853;0.450699;0.121832;0.451490;0.121873;0.450059;0.121990;0.452028;0.121780 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 76096;4992;79616;5376;76480;5120;80320;6144;77696;4992 | |
173 | densenet1_stage3_batchnorm10_fwd | BatchNorm | [1,624,14,14] | 51 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 773760 | 9984.00 | 54442.67 | 17.60 | 12.01 | 136.54 | true | 0.180762;0.176530;0.176389;0.174407;0.174557 | 773760;773760;773760;773760;773760 | 54528;56704;53888;54912;52992 | 9984;9984;9984;9984;10240 | |
174 | densenet1_stage3_relu10_fwd | Activation | [1,624,14,14] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 244608 | 0.00 | 2986.67 | 51.00 | 81.90 | 52.41 | false | 0.509904;0.508454;0.513095;0.511726;0.509844 | 244608;244608;244608;244608;244608 | 0;0;0;0;0 | 2816;3200;2176;2944;3456 | |
175 | densenet1_stage3_conv10_fwd | Convolution | [1,624,14,14] | 2734.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 44.00 | 53711616 | 479232.00 | 166954.67 | 3.10 | 83.12 | 1220.72 | false | 0.031247;0.031248;0.031248;0.031248;0.031248 | 53711616;53711616;53711616;53711616;53711616 | 166784;168128;165696;165952;169408 | 479232;479232;479232;479232;479232 | |
176 | densenet1_stage3_batchnorm11_fwd | BatchNorm | [1,192,14,14] | 65.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 0.00 | 6.80 | 77.50 | 51.01 | false | 0.067741;0.067939;0.067757;0.066323;0.067787 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 0;0;0;0;0 | |
177 | densenet1_stage3_relu11_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 42.67 | 31.80 | 1763.99 | 18.82 | false | 0.319151;0.317125;0.317843;0.317719;0.318049 | 75264;75264;75264;75264;75264 | 0;0;0;0;0 | 0;128;0;0;128 | |
178 | densenet1_stage3_conv11_fwd | Convolution | [1,192,14,14] | 2435 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 22528.00 | 12.50 | 1168.36 | 529.95 | false | 0.124916;0.124908;0.124907;0.124916;0.124913 | 26320896;26320896;26320896;26320896;26320896 | 4992;0;0;0;0 | 24704;22144;23296;22144;21632 | |
178 | densenet1_stage3_conv11_fwd | Convolution | [1,192,14,14] | 2435 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331818.67 | 6016.00 | 7.50 | 2.11 | 142.54 | true | 0.075209;0.075254;0.075379;0.075306;0.074870 | 712704;712704;712704;712704;712704 | 6016;6400;5632;4224;7552 | 331904;331776;331776;331776;333824 | |
179 | densenet1_stage3_concat5 | Concat | [1,624,14,14] | 52 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.14 | 0 | 0.00 | 7594.67 | 29.50 | 0.00 | 0.00 | true | 0.475567;0.121866;0.467647;0.121795;0.471144;0.121792;0.469399;0.121941;0.468519;0.121840 | 0;0;0;0;0;0;0;0;0;0 | 0;0;5120;0;0;0;0;0;0;0 | 0;15232;0;15232;0;15488;0;15232;0;15104 | |
179 | densenet1_stage3_concat5 | Concat | [1,624,14,14] | 52 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 0.00 | 7594.67 | 29.50 | 0.00 | 0.00 | true | 0.475567;0.121866;0.467647;0.121795;0.471144;0.121792;0.469399;0.121941;0.468519;0.121840 | 0;0;0;0;0;0;0;0;0;0 | 0;0;5120;0;0;0;0;0;0;0 | 0;15232;0;15232;0;15488;0;15232;0;15104 | |
180 | densenet1_stage3_batchnorm12_fwd | BatchNorm | [1,672,14,14] | 55.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 833280 | 10752.00 | 196437.33 | 18.30 | 4.02 | 147.04 | true | 0.182660;0.183354;0.182666;0.183648;0.192024 | 833280;833280;833280;833280;833280 | 10752;10752;10752;10752;10752 | 196064;196640;196608;195840;198784 | |
181 | densenet1_stage3_relu12_fwd | Activation | [1,672,14,14] | 33 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 263424 | 85.33 | 12245.33 | 52.00 | 21.36 | 52.68 | false | 0.519086;0.519996;0.519564;0.516977;0.519958 | 263424;263424;263424;263424;263424 | 12928;12288;12416;12032;11392 | 0;256;0;512;0 | |
182 | densenet1_stage3_conv12_fwd | Convolution | [1,672,14,14] | 2930.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 46.00 | 57840384 | 516096.00 | 10005.33 | 3.10 | 109.94 | 1257.40 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 57840384;57840384;57840384;57840384;57840384 | 516096;516096;516096;516096;516096 | 10976;10176;8896;9920;9920 | |
183 | densenet1_stage3_batchnorm13_fwd | BatchNorm | [1,192,14,14] | 56.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 41205.33 | 6.80 | 5.38 | 51.01 | true | 0.068024;0.068552;0.068338;0.068144;0.068190 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 41600;41600;40416;40032;41728 | |
184 | densenet1_stage3_relu13_fwd | Activation | [1,192,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 6101.33 | 31.70 | 12.34 | 18.82 | true | 0.316596;0.316514;0.316235;0.318939;0.316693 | 75264;75264;75264;75264;75264 | 0;0;0;0;0 | 6304;6176;5440;5824;6912 | |
185 | densenet1_stage3_conv13_fwd | Convolution | [1,192,14,14] | 2440 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 97866.67 | 12.50 | 268.95 | 526.42 | false | 0.124933;0.124930;0.124932;0.124925;0.124921 | 26320896;26320896;26320896;26320896;26320896 | 97312;98080;98336;96288;98208 | 0;0;0;256;0 | |
185 | densenet1_stage3_conv13_fwd | Convolution | [1,192,14,14] | 2440 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.33 | 712704 | 331776.00 | 101802.67 | 7.50 | 1.64 | 164.48 | true | 0.075326;0.075176;0.075103;0.075269;0.075235 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 103072;102016;100352;102464;100928 | |
186 | densenet1_stage3_concat6 | Concat | [1,672,14,14] | 49.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.29 | 0 | 0.00 | 117946.67 | 30.70 | 0.00 | 0.00 | true | 0.492293;0.121787;0.492261;0.121801;0.493539;0.121861;0.491111;0.121857;0.491620;0.121803 | 0;0;0;0;0;0;0;0;0;0 | 0;256;0;0;0;256;0;0;0;0 | 208320;30368;205536;29984;210112;30528;204000;30656;206592;30368 | |
186 | densenet1_stage3_concat6 | Concat | [1,672,14,14] | 49.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 0.00 | 117946.67 | 30.70 | 0.00 | 0.00 | true | 0.492293;0.121787;0.492261;0.121801;0.493539;0.121861;0.491111;0.121857;0.491620;0.121803 | 0;0;0;0;0;0;0;0;0;0 | 0;256;0;0;0;256;0;0;0;0 | 208320;30368;205536;29984;210112;30528;204000;30656;206592;30368 | |
187 | densenet1_stage3_batchnorm14_fwd | BatchNorm | [1,720,14,14] | 54.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.33 | 892800 | 11520.00 | 140437.33 | 19.40 | 5.88 | 167.41 | true | 0.193554;0.186672;0.194661;0.194010;0.193194 | 892800;892800;892800;892800;892800 | 11520;11520;11520;11520;11520 | 137888;144672;137600;140064;143360 | |
188 | densenet1_stage3_relu14_fwd | Activation | [1,720,14,14] | 34.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 282240 | 0.00 | 8288.00 | 52.80 | 34.05 | 65.14 | false | 0.526938;0.529460;0.526728;0.527767;0.529366 | 282240;282240;282240;282240;282240 | 9248;7584;8320;8448;8096 | 0;0;0;0;0 | |
189 | densenet1_stage3_conv14_fwd | Convolution | [1,720,14,14] | 3150 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 49.00 | 61969152 | 552960.00 | 3168.00 | 3.10 | 111.43 | 1264.68 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 61969152;61969152;61969152;61969152;61969152 | 552960;552960;552960;552960;552960 | 4192;3680;1760;1632;4064 | |
190 | densenet1_stage3_batchnorm15_fwd | BatchNorm | [1,192,14,14] | 58.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 7125.33 | 6.80 | 23.35 | 47.62 | false | 0.068055;0.067980;0.068206;0.068307;0.068065 | 238080;238080;238080;238080;238080 | 7168;7168;6912;7040;7168 | 3072;3072;3072;3072;3072 | |
191 | densenet1_stage3_relu15_fwd | Activation | [1,192,14,14] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 640.00 | 31.70 | 117.60 | 18.82 | false | 0.330457;0.315672;0.316872;0.317179;0.317370 | 75264;75264;75264;75264;75264 | 0;0;6144;0;0 | 768;640;512;512;1152 | |
192 | densenet1_stage3_conv15_fwd | Convolution | [1,192,14,14] | 2439.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 29226.67 | 12.50 | 900.58 | 529.95 | false | 0.124910;0.124914;0.124910;0.124915;0.124923 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 28416;28800;29184;33920;29696 | |
192 | densenet1_stage3_conv15_fwd | Convolution | [1,192,14,14] | 2439.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 5557.33 | 7.50 | 2.11 | 142.54 | true | 0.075244;0.075237;0.075256;0.075352;0.075193 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 8480;6944;4320;1952;5408 | |
193 | densenet1_stage3_concat7 | Concat | [1,720,14,14] | 54.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.43 | 0 | 0.00 | 14474.67 | 31.80 | 0.00 | 0.00 | true | 0.515129;0.121897;0.512125;0.121778;0.517887;0.121742;0.514995;0.121803;0.512650;0.121929 | 0;0;0;0;0;0;0;0;0;0 | 256;28544;0;28864;0;28928;256;28928;0;29376 | 0;0;0;0;0;0;0;0;0;0 | |
193 | densenet1_stage3_concat7 | Concat | [1,720,14,14] | 54.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.14 | 0 | 0.00 | 14474.67 | 31.80 | 0.00 | 0.00 | true | 0.515129;0.121897;0.512125;0.121778;0.517887;0.121742;0.514995;0.121803;0.512650;0.121929 | 0;0;0;0;0;0;0;0;0;0 | 256;28544;0;28864;0;28928;256;28928;0;29376 | 0;0;0;0;0;0;0;0;0;0 | |
194 | densenet1_stage3_batchnorm16_fwd | BatchNorm | [1,768,14,14] | 56.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 952320 | 12288.00 | 388437.33 | 19.30 | 2.38 | 168.05 | true | 0.192657;0.194891;0.198493;0.192063;0.192002 | 952320;952320;952320;952320;952320 | 389472;390688;388896;386144;386944 | 12288;12288;12288;12288;12288 | |
195 | densenet1_stage3_relu16_fwd | Activation | [1,768,14,14] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 301056 | 0.00 | 19168.00 | 53.10 | 15.71 | 60.21 | true | 0.532768;0.531622;0.530687;0.524785;0.530261 | 301056;301056;301056;301056;301056 | 19392;19008;19104;18592;20992 | 0;0;0;0;0 | |
196 | densenet1_stage3_conv16_fwd | Convolution | [1,768,14,14] | 3336.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 52.00 | 66097920 | 589824.00 | 90112.00 | 3.10 | 97.21 | 1271.11 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 66097920;66097920;66097920;66097920;66097920 | 589824;589824;589824;589824;589824 | 89824;88672;90848;92256;89664 | |
197 | densenet1_stage3_batchnorm17_fwd | BatchNorm | [1,192,14,14] | 60.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 101578.67 | 6.80 | 2.27 | 47.62 | true | 0.068079;0.068216;0.067978;0.068233;0.067871 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 101024;102176;101408;101664;101664 | |
198 | densenet1_stage3_relu17_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 85.33 | 11306.67 | 31.70 | 6.61 | 18.82 | true | 0.317022;0.315323;0.318701;0.317838;0.316115 | 75264;75264;75264;75264;75264 | 12544;10752;11392;11136;11392 | 4864;256;0;0;0 | |
199 | densenet1_stage3_conv17_fwd | Convolution | [1,192,14,14] | 2455 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 190336.00 | 12.50 | 138.29 | 529.95 | false | 0.124911;0.124922;0.124927;0.124923;0.124931 | 26320896;26320896;26320896;26320896;26320896 | 171648;189312;191744;191744;189952 | 0;0;0;0;0 | |
199 | densenet1_stage3_conv17_fwd | Convolution | [1,192,14,14] | 2455 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 186517.33 | 7.50 | 1.38 | 142.54 | true | 0.075207;0.075257;0.075258;0.075123;0.075198 | 712704;712704;712704;712704;712704 | 193440;185728;187168;186656;183840 | 331776;331776;331776;331776;331776 | |
200 | densenet1_stage3_concat8 | Concat | [1,768,14,14] | 52.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.29 | 0 | 0.00 | 182197.33 | 32.60 | 0.00 | 0.00 | true | 0.531020;0.121854;0.529932;0.121785;0.531485;0.121828;0.530587;0.121792;0.530801;0.121775 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 342048;20480;343904;20864;347872;20864;345440;20736;344640;20864 | |
200 | densenet1_stage3_concat8 | Concat | [1,768,14,14] | 52.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.14 | 0 | 0.00 | 182197.33 | 32.60 | 0.00 | 0.00 | true | 0.531020;0.121854;0.529932;0.121785;0.531485;0.121828;0.530587;0.121792;0.530801;0.121775 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 342048;20480;343904;20864;347872;20864;345440;20736;344640;20864 | |
201 | densenet1_stage3_batchnorm18_fwd | BatchNorm | [1,816,14,14] | 53.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 1011840 | 13141.33 | 262965.33 | 20.90 | 3.66 | 178.55 | true | 0.215921;0.215158;0.204747;0.207076;0.205363 | 1011840;1011840;1011840;1011840;1011840 | 13056;13312;13056;13824;13056 | 262720;265152;261920;257920;264256 | |
202 | densenet1_stage3_relu18_fwd | Activation | [1,816,14,14] | 38.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 319872 | 0.00 | 11232.00 | 53.20 | 28.48 | 68.54 | false | 0.525179;0.530805;0.533672;0.534861;0.531039 | 319872;319872;319872;319872;319872 | 11264;9088;11648;11168;11264 | 0;0;0;0;0 | |
203 | densenet1_stage3_conv18_fwd | Convolution | [1,816,14,14] | 3551.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 55.00 | 70226688 | 626688.00 | 11658.67 | 3.10 | 110.01 | 1276.85 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 70226688;70226688;70226688;70226688;70226688 | 626688;626688;626688;626688;631808 | 13440;12256;10592;10336;12128 | |
204 | densenet1_stage3_batchnorm19_fwd | BatchNorm | [1,192,14,14] | 56.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 39338.67 | 6.80 | 5.61 | 47.62 | true | 0.068118;0.066466;0.067211;0.068176;0.068303 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 39168;39424;39168;39424;39424 | |
205 | densenet1_stage3_relu19_fwd | Activation | [1,192,14,14] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 4309.33 | 31.60 | 17.47 | 18.82 | false | 0.319786;0.316903;0.315825;0.315446;0.316101 | 75264;75264;75264;75264;75264 | 0;0;0;0;0 | 4352;4352;4224;3840;4480 | |
206 | densenet1_stage3_conv19_fwd | Convolution | [1,192,14,14] | 2454 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 59946.67 | 12.50 | 439.07 | 529.95 | false | 0.124915;0.124912;0.124914;0.124910;0.124908 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 61824;45056;58240;59776;63616 | |
206 | densenet1_stage3_conv19_fwd | Convolution | [1,192,14,14] | 2454 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 37920.00 | 7.50 | 1.93 | 142.54 | true | 0.075112;0.075114;0.075199;0.075201;0.075178 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;333824 | 39840;55840;37152;36512;36768 | |
207 | densenet1_stage3_concat9 | Concat | [1,816,14,14] | 58.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.57 | 0 | 0.00 | 6144.00 | 33.30 | 0.00 | 0.00 | true | 0.543592;0.121837;0.544130;0.121818;0.542940;0.121898;0.544695;0.121735;0.543969;0.121942 | 0;0;0;0;0;0;0;0;0;0 | 256;0;0;0;1280;0;0;0;0;0 | 0;12288;128;12288;0;12288;0;12160;0;12544 | |
207 | densenet1_stage3_concat9 | Concat | [1,816,14,14] | 58.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.29 | 0 | 0.00 | 6144.00 | 33.30 | 0.00 | 0.00 | true | 0.543592;0.121837;0.544130;0.121818;0.542940;0.121898;0.544695;0.121735;0.543969;0.121942 | 0;0;0;0;0;0;0;0;0;0 | 256;0;0;0;1280;0;0;0;0;0 | 0;12288;128;12288;0;12288;0;12160;0;12544 | |
208 | densenet1_stage3_batchnorm20_fwd | BatchNorm | [1,864,14,14] | 59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 1071360 | 13824.00 | 479445.33 | 20.90 | 2.17 | 178.56 | true | 0.217716;0.208587;0.208855;0.209555;0.207974 | 1071360;1071360;1071360;1071360;1071360 | 478016;478880;479552;481088;479904 | 13824;13824;13824;13824;13824 | |
209 | densenet1_stage3_relu20_fwd | Activation | [1,864,14,14] | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 338688 | 0.00 | 23264.00 | 51.90 | 14.56 | 67.74 | true | 0.514688;0.516102;0.522232;0.519020;0.521473 | 338688;338688;338688;338688;338688 | 0;0;0;0;0 | 23488;23200;22560;23264;23328 | |
210 | densenet1_stage3_conv20_fwd | Convolution | [1,864,14,14] | 3761.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 74355456 | 663552.00 | 123061.33 | 3.10 | 94.53 | 1281.99 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 74355456;74355456;74355456;74355456;74355456 | 663552;663552;663552;663552;663552 | 123488;122592;122496;123104;124384 | |
211 | densenet1_stage3_batchnorm21_fwd | BatchNorm | [1,192,14,14] | 59 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 102218.67 | 6.80 | 2.26 | 51.01 | true | 0.068280;0.068344;0.068196;0.068192;0.068278 | 238080;238080;238080;238080;238080 | 3072;3072;13568;3072;3072 | 102560;102048;103584;102048;101792 | |
212 | densenet1_stage3_relu21_fwd | Activation | [1,192,14,14] | 16.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 11093.33 | 31.80 | 6.78 | 18.82 | true | 0.317690;0.316427;0.319945;0.317304;0.317670 | 75264;75264;75264;75264;75264 | 0;0;0;0;0 | 10880;11264;10368;11136;11648 | |
213 | densenet1_stage3_conv21_fwd | Convolution | [1,192,14,14] | 2442.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.00 | 26320896 | 0.00 | 230325.33 | 12.50 | 114.28 | 537.16 | false | 0.124931;0.124934;0.124929;0.124921;0.124933 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;2304 | 226688;229632;233088;231072;230272 | |
213 | densenet1_stage3_conv21_fwd | Convolution | [1,192,14,14] | 2442.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 331776.00 | 241717.33 | 7.50 | 1.24 | 152.71 | true | 0.075265;0.075265;0.075198;0.075255;0.075141 | 712704;712704;712704;712704;712704 | 333568;331776;331776;331776;331776 | 244416;241856;240032;242464;240832 | |
214 | densenet1_stage3_concat10 | Concat | [1,864,14,14] | 55 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.57 | 0 | 0.00 | 231077.33 | 33.90 | 0.00 | 0.00 | true | 0.557005;0.121996;0.557103;0.121870;0.554830;0.121816;0.555729;0.121893;0.557056;0.121845 | 0;0;0;0;0;0;0;0;0;0 | 441184;18560;443904;18688;447424;18816;444544;19456;444160;18944 | 0;0;0;0;0;0;256;0;0;0 | |
214 | densenet1_stage3_concat10 | Concat | [1,864,14,14] | 55 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.14 | 0 | 0.00 | 231077.33 | 33.90 | 0.00 | 0.00 | true | 0.557005;0.121996;0.557103;0.121870;0.554830;0.121816;0.555729;0.121893;0.557056;0.121845 | 0;0;0;0;0;0;0;0;0;0 | 441184;18560;443904;18688;447424;18816;444544;19456;444160;18944 | 0;0;0;0;0;0;256;0;0;0 | |
215 | densenet1_stage3_batchnorm22_fwd | BatchNorm | [1,912,14,14] | 56.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 1130880 | 14592.00 | 296981.33 | 22.10 | 3.63 | 188.48 | true | 0.219563;0.220789;0.222126;0.221808;0.212788 | 1130880;1130880;1130880;1130880;1130880 | 14592;14592;14592;14592;14592 | 297696;297248;296000;298464;295136 | |
216 | densenet1_stage3_relu22_fwd | Activation | [1,912,14,14] | 41.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 357504 | 1706.67 | 11178.67 | 51.00 | 27.75 | 82.51 | false | 0.507911;0.508949;0.511127;0.514865;0.508516 | 357504;357504;357504;357504;357504 | 5120;0;0;0;5120 | 11648;11008;12416;10880;10368 | |
217 | densenet1_stage3_conv22_fwd | Convolution | [1,912,14,14] | 3952 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 60.67 | 78484224 | 700416.00 | 11530.67 | 3.10 | 110.24 | 1293.69 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 78484224;78484224;78484224;78484224;78484224 | 700416;700416;700416;700416;700416 | 11744;11616;11232;10592;11872 | |
218 | densenet1_stage3_batchnorm23_fwd | BatchNorm | [1,192,14,14] | 63 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3125.33 | 39210.67 | 6.80 | 5.62 | 51.01 | true | 0.068078;0.068172;0.067893;0.068068;0.068147 | 238080;238080;238080;238080;238080 | 3072;8448;3232;3072;3072 | 39040;39168;40768;38912;39424 | |
219 | densenet1_stage3_relu23_fwd | Activation | [1,192,14,14] | 13.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 4650.67 | 31.60 | 16.18 | 18.82 | true | 0.316353;0.316702;0.315419;0.318251;0.316316 | 75264;75264;75264;75264;75264 | 4736;4608;4864;4608;4480 | 0;0;288;0;0 | |
220 | densenet1_stage3_conv23_fwd | Convolution | [1,192,14,14] | 2442.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 73600.00 | 12.50 | 357.62 | 529.95 | false | 0.124911;0.124911;0.124909;0.124913;0.124913 | 26320896;26320896;26320896;26320896;26320896 | 0;0;512;0;0 | 78464;74368;69120;75008;71424 | |
220 | densenet1_stage3_conv23_fwd | Convolution | [1,192,14,14] | 2442.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 39242.67 | 7.50 | 1.92 | 142.54 | true | 0.075264;0.075285;0.075248;0.075270;0.075203 | 712704;712704;712704;712704;712704 | 331776;331776;332032;331776;331776 | 36256;40480;40992;35488;42144 | |
221 | densenet1_stage3_concat11 | Concat | [1,912,14,14] | 58.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 0.00 | 117.33 | 34.50 | 0.00 | 0.00 | true | 0.567154;0.121795;0.567742;0.121906;0.566845;0.121920;0.569800;0.121836;0.568711;0.121836 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;6656;0;0;0 | 0;416;0;0;256;320;128;0;0;640 | |
221 | densenet1_stage3_concat11 | Concat | [1,912,14,14] | 58.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 117.33 | 34.50 | 0.00 | 0.00 | true | 0.567154;0.121795;0.567742;0.121906;0.566845;0.121920;0.569800;0.121836;0.568711;0.121836 | 0;0;0;0;0;0;0;0;0;0 | 0;416;0;0;256;320;128;0;0;640 | 0;0;0;0;0;0;6656;0;0;0 | |
222 | densenet1_stage3_batchnorm24_fwd | BatchNorm | [1,960,14,14] | 64.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 1190400 | 15360.00 | 567818.67 | 23.00 | 2.04 | 198.40 | true | 0.230045;0.228017;0.231259;0.229135;0.231344 | 1190400;1190400;1190400;1190400;1190400 | 568064;569984;567840;566784;567552 | 15360;15360;15360;15360;15360 | |
223 | densenet1_stage3_relu24_fwd | Activation | [1,960,14,14] | 43.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 376320 | 0.00 | 25376.00 | 50.50 | 14.83 | 75.26 | true | 0.503682;0.503255;0.506861;0.502495;0.507275 | 376320;376320;376320;376320;376320 | 0;0;256;0;0 | 25888;24864;27328;24736;25376 | |
224 | densenet1_stage3_conv24_fwd | Convolution | [1,960,14,14] | 4149 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 63.00 | 82612992 | 737280.00 | 122762.67 | 3.10 | 96.06 | 1311.32 | false | 0.031249;0.031248;0.031248;0.031248;0.031249 | 82612992;82612992;82612992;82612992;82612992 | 737280;737280;737536;737280;737280 | 122464;123488;122208;124896;122336 | |
225 | densenet1_stage3_batchnorm25_fwd | BatchNorm | [1,192,14,14] | 69.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 101749.33 | 6.80 | 2.27 | 47.62 | true | 0.067250;0.068256;0.068167;0.068206;0.068346 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 101792;101536;101920;101792;101664 | |
226 | densenet1_stage3_relu25_fwd | Activation | [1,192,14,14] | 17.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 11733.33 | 31.60 | 6.41 | 18.82 | true | 0.315283;0.315914;0.317201;0.317402;0.316154 | 75264;75264;75264;75264;75264 | 11776;12032;11520;11008;11904 | 0;0;224;0;0 | |
227 | densenet1_stage3_conv25_fwd | Convolution | [1,192,14,14] | 2481.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 276266.67 | 12.50 | 95.27 | 526.42 | false | 0.124931;0.124929;0.124930;0.124920;0.124922 | 26320896;26320896;26320896;26320896;26320896 | 5376;0;0;0;0 | 266368;279424;278656;281472;270720 | |
227 | densenet1_stage3_conv25_fwd | Convolution | [1,192,14,14] | 2481.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 268192.00 | 7.50 | 1.19 | 142.54 | true | 0.075281;0.075347;0.075284;0.075152;0.075207 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 265984;267232;268032;269312;275840 | |
228 | densenet1_stage3_concat12 | Concat | [1,960,14,14] | 56.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 0.00 | 280517.33 | 35.20 | 0.00 | 0.00 | true | 0.582712;0.121824;0.583683;0.121814;0.581634;0.121873;0.582978;0.121845;0.581917;0.121914 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;5120;0;0;0;0;0;0 | 534080;34688;532064;34944;531456;33792;533056;34304;515648;34304 | |
228 | densenet1_stage3_concat12 | Concat | [1,960,14,14] | 56.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.57 | 0 | 0.00 | 280517.33 | 35.20 | 0.00 | 0.00 | true | 0.582712;0.121824;0.583683;0.121814;0.581634;0.121873;0.582978;0.121845;0.581917;0.121914 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;5120;0;0;0;0;0;0 | 534080;34688;532064;34944;531456;33792;533056;34304;515648;34304 | |
229 | densenet1_stage3_batchnorm26_fwd | BatchNorm | [1,1008,14,14] | 66 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 1249920 | 16128.00 | 316896.00 | 23.10 | 3.75 | 208.32 | true | 0.231557;0.231065;0.231206;0.230757;0.239545 | 1249920;1249920;1249920;1249920;1249920 | 16128;16128;16128;16128;16128 | 317152;314848;318048;321184;315488 | |
230 | densenet1_stage3_relu26_fwd | Activation | [1,1008,14,14] | 47.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 395136 | 0.00 | 12800.00 | 51.20 | 30.87 | 79.03 | false | 0.509938;0.512146;0.510364;0.516397;0.514717 | 395136;395136;395136;395136;395136 | 0;1792;0;0;0 | 13312;11904;13440;12928;12160 | |
231 | densenet1_stage3_conv26_fwd | Convolution | [1,1008,14,14] | 4370.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.00 | 86741760 | 774144.00 | 12085.33 | 3.10 | 110.33 | 1314.27 | false | 0.031248;0.031248;0.031248;0.031248;0.031248 | 86741760;86741760;86741760;86741760;86741760 | 774144;774144;784704;774144;774144 | 12128;12128;12864;10720;12000 | |
232 | densenet1_stage3_batchnorm27_fwd | BatchNorm | [1,192,14,14] | 63.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 39424.00 | 6.80 | 5.60 | 47.62 | true | 0.067992;0.068160;0.068225;0.068254;0.068183 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 38784;39424;39296;39808;39552 | |
233 | densenet1_stage3_relu27_fwd | Activation | [1,192,14,14] | 17.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 0.00 | 4437.33 | 31.80 | 16.96 | 18.82 | true | 0.378025;0.318565;0.316168;0.316610;0.317576 | 75264;75264;75264;75264;75264 | 0;0;0;0;0 | 4736;4608;4480;3584;4224 | |
234 | densenet1_stage3_conv27_fwd | Convolution | [1,192,14,14] | 2457 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 75189.33 | 12.50 | 350.06 | 526.42 | false | 0.124910;0.124919;0.124921;0.124908;0.124911 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;2048;0 | 76544;75936;73088;67840;80640 | |
234 | densenet1_stage3_conv27_fwd | Convolution | [1,192,14,14] | 2457 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 50656.00 | 7.50 | 1.86 | 142.54 | true | 0.075246;0.075280;0.075245;0.075273;0.075261 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 46624;48288;51488;52192;52512 | |
235 | densenet1_stage3_concat13 | Concat | [1,1008,14,14] | 64.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.43 | 0 | 0.00 | 14976.00 | 35.80 | 0.00 | 0.00 | true | 0.595237;0.121854;0.595379;0.121836;0.594715;0.121841;0.594610;0.121827;0.594570;0.121798 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 0;29824;128;30208;0;30080;0;29824;0;30080 | |
235 | densenet1_stage3_concat13 | Concat | [1,1008,14,14] | 64.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.14 | 0 | 0.00 | 14976.00 | 35.80 | 0.00 | 0.00 | true | 0.595237;0.121854;0.595379;0.121836;0.594715;0.121841;0.594610;0.121827;0.594570;0.121798 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 0;29824;128;30208;0;30080;0;29824;0;30080 | |
236 | densenet1_stage3_batchnorm28_fwd | BatchNorm | [1,1056,14,14] | 71.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 1309440 | 16896.00 | 755968.00 | 24.30 | 1.69 | 206.76 | true | 0.243656;0.245088;0.243956;0.242154;0.237625 | 1309440;1309440;1309440;1309440;1309440 | 757376;754400;756128;758112;753152 | 16896;16896;16896;16896;16896 | |
237 | densenet1_stage3_relu28_fwd | Activation | [1,1056,14,14] | 68.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 413952 | 0.00 | 33408.00 | 51.80 | 12.39 | 103.49 | true | 0.519296;0.522402;0.519735;0.513848;0.516050 | 413952;413952;413952;413952;413952 | 0;0;0;0;512 | 33056;33088;35776;31136;34080 | |
238 | densenet1_stage3_conv28_fwd | Convolution | [1,1056,14,14] | 4541.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 68.33 | 90870528 | 811008.00 | 123530.67 | 3.10 | 97.24 | 1329.82 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 90870528;90870528;90870528;90870528;90870528 | 811008;811008;811008;811008;811008 | 124256;123744;122208;124256;122592 | |
239 | densenet1_stage3_batchnorm29_fwd | BatchNorm | [1,192,14,14] | 65.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 102517.33 | 6.80 | 2.25 | 51.01 | true | 0.068069;0.068001;0.067931;0.067837;0.068180 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 102432;102688;103328;102176;102432 | |
240 | densenet1_stage3_relu29_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 74.67 | 11264.00 | 31.70 | 6.64 | 18.82 | true | 0.315597;0.316021;0.318178;0.318781;0.316994 | 75264;75264;75264;75264;75264 | 2272;0;224;0;0 | 10624;11904;11136;11008;11648 | |
241 | densenet1_stage3_conv29_fwd | Convolution | [1,192,14,14] | 2448.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.33 | 26320896 | 0.00 | 324394.67 | 12.50 | 81.14 | 533.54 | false | 0.124925;0.124927;0.124931;0.124921;0.124923 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;2048 | 317184;340864;338432;317568;308864 | |
241 | densenet1_stage3_conv29_fwd | Convolution | [1,192,14,14] | 2448.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 327552.00 | 7.50 | 1.08 | 142.54 | true | 0.075164;0.075476;0.075300;0.075219;0.075140 | 712704;712704;712704;712704;712704 | 333920;310048;312736;336000;337120 | 331776;331776;331776;331776;331776 | |
242 | densenet1_stage3_concat14 | Concat | [1,1056,14,14] | 62.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.00 | 0 | 0.00 | 336826.67 | 36.30 | 0.00 | 0.00 | true | 0.604641;0.121982;0.605307;0.121824;0.602158;0.121865;0.604388;0.121847;0.603757;0.121931 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;256;0;0;0;0;0;0 | 638336;36608;681728;36864;677184;36352;639456;36864;631808;37632 | |
242 | densenet1_stage3_concat14 | Concat | [1,1056,14,14] | 62.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 336826.67 | 36.30 | 0.00 | 0.00 | true | 0.604641;0.121982;0.605307;0.121824;0.602158;0.121865;0.604388;0.121847;0.603757;0.121931 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;256;0;0;0;0;0;0 | 638336;36608;681728;36864;677184;36352;639456;36864;631808;37632 | |
243 | densenet1_stage3_batchnorm30_fwd | BatchNorm | [1,1104,14,14] | 68.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1368960 | 17664.00 | 450485.33 | 24.80 | 2.92 | 195.57 | true | 0.245544;0.247876;0.249524;0.248850;0.248091 | 1368960;1368960;1368960;1368960;1368960 | 17664;17664;17664;17664;17664 | 452288;454880;451648;447520;445792 | |
244 | densenet1_stage3_relu30_fwd | Activation | [1,1104,14,14] | 49.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 432768 | 0.00 | 19712.00 | 52.60 | 21.95 | 86.55 | false | 0.527019;0.524259;0.527512;0.528861;0.524846 | 432768;432768;432768;432768;432768 | 17024;19328;20480;20864;19328 | 0;0;0;0;0 | |
245 | densenet1_stage3_conv30_fwd | Convolution | [1,1104,14,14] | 4759.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 71.00 | 94999296 | 847872.00 | 26720.00 | 3.10 | 108.62 | 1338.02 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 94999296;94999296;94999296;94999296;94999296 | 847872;847872;847872;847872;849920 | 26080;27488;26976;26720;26464 | |
246 | densenet1_stage3_batchnorm31_fwd | BatchNorm | [1,192,14,14] | 60.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 238080 | 3072.00 | 53034.67 | 6.80 | 4.24 | 54.95 | true | 0.068151;0.067855;0.067682;0.067971;0.068119 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 52992;52992;53120;52992;53376 | |
247 | densenet1_stage3_relu31_fwd | Activation | [1,192,14,14] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 149.33 | 6016.00 | 33.20 | 12.21 | 18.82 | true | 0.337744;0.340196;0.316584;0.317623;0.339997 | 75264;75264;75264;75264;75264 | 6144;6144;5888;5888;6016 | 512;224;0;0;224 | |
248 | densenet1_stage3_conv31_fwd | Convolution | [1,192,14,14] | 2445.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.00 | 26320896 | 0.00 | 153088.00 | 12.50 | 171.93 | 537.16 | false | 0.124917;0.124912;0.124920;0.124914;0.124912 | 26320896;26320896;26320896;26320896;26320896 | 144256;147328;142336;179072;167680 | 0;0;0;0;0 | |
248 | densenet1_stage3_conv31_fwd | Convolution | [1,192,14,14] | 2445.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 331776.00 | 106954.67 | 7.50 | 1.62 | 152.71 | true | 0.075092;0.075258;0.075124;0.075017;0.075282 | 712704;712704;712704;712704;712704 | 334336;331776;331776;331776;331776 | 129312;104992;105504;96288;110368 | |
249 | densenet1_stage3_concat15 | Concat | [1,1104,14,14] | 63.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 0.00 | 15333.33 | 36.70 | 0.00 | 0.00 | true | 0.611682;0.121845;0.612049;0.122033;0.612313;0.121846;0.612800;0.121864;0.613740;0.121949 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 256;30496;256;30368;256;30240;384;30496;256;30624 | |
249 | densenet1_stage3_concat15 | Concat | [1,1104,14,14] | 63.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.57 | 0 | 0.00 | 15333.33 | 36.70 | 0.00 | 0.00 | true | 0.611682;0.121845;0.612049;0.122033;0.612313;0.121846;0.612800;0.121864;0.613740;0.121949 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 256;30496;256;30368;256;30240;384;30496;256;30624 | |
250 | densenet1_stage3_batchnorm32_fwd | BatchNorm | [1,1152,14,14] | 72 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 1428480 | 18432.00 | 865898.67 | 25.50 | 1.62 | 238.08 | true | 0.254078;0.255592;0.255301;0.257435;0.252209 | 1428480;1428480;1428480;1428480;1428480 | 18432;18432;25088;18432;18432 | 864544;865760;868032;860832;867392 | |
251 | densenet1_stage3_relu32_fwd | Activation | [1,1152,14,14] | 52.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 451584 | 0.00 | 33312.00 | 53.30 | 13.56 | 90.32 | true | 0.537576;0.533515;0.530604;0.533236;0.533147 | 451584;451584;451584;451584;451584 | 0;0;3584;0;0 | 31520;34848;29888;34464;33952 | |
252 | densenet1_stage3_conv32_fwd | Convolution | [1,1152,14,14] | 4946.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 74.67 | 99128064 | 884736.00 | 130581.33 | 3.10 | 97.63 | 1327.60 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 99128064;99128064;99128064;99128064;99128064 | 132576;128992;130656;130528;130560 | 884736;884736;884736;884736;891392 | |
253 | densenet1_stage3_batchnorm33_fwd | BatchNorm | [1,192,14,14] | 64.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 102474.67 | 6.80 | 2.26 | 51.01 | true | 0.068074;0.068285;0.068858;0.068032;0.068181 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 102688;102304;102304;102432;102816 | |
254 | densenet1_stage3_relu33_fwd | Activation | [1,192,14,14] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 75264 | 74.67 | 11690.67 | 32.40 | 6.40 | 17.37 | true | 0.317880;0.316429;0.339568;0.337768;0.315553 | 75264;75264;75264;75264;75264 | 224;0;224;0;0 | 11648;12032;11904;10624;11520 | |
255 | densenet1_stage3_conv33_fwd | Convolution | [1,192,14,14] | 2439 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 319402.67 | 12.50 | 82.41 | 529.95 | false | 0.124931;0.124927;0.124923;0.124922;0.124928 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 320256;326912;334976;297728;311040 | |
255 | densenet1_stage3_conv33_fwd | Convolution | [1,192,14,14] | 2439 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 336544.00 | 7.50 | 1.07 | 142.54 | true | 0.075057;0.075215;0.075179;0.075254;0.075116 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 335616;328960;320896;358464;345056 | |
256 | densenet1_stage3_concat16 | Concat | [1,1152,14,14] | 60 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 0.00 | 371610.67 | 37.10 | 0.00 | 0.00 | true | 0.620406;0.121819;0.621262;0.121839;0.622810;0.121914;0.621308;0.121846;0.624260;0.121848 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 716544;34720;730560;35040;745152;34720;706624;35360;700000;36096 | |
256 | densenet1_stage3_concat16 | Concat | [1,1152,14,14] | 60 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 0.00 | 371610.67 | 37.10 | 0.00 | 0.00 | true | 0.620406;0.121819;0.621262;0.121839;0.622810;0.121914;0.621308;0.121846;0.624260;0.121848 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;0;0 | 716544;34720;730560;35040;745152;34720;706624;35360;700000;36096 | |
257 | densenet1_stage3_batchnorm34_fwd | BatchNorm | [1,1200,14,14] | 66.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1488000 | 19200.00 | 603829.33 | 26.60 | 2.39 | 212.57 | true | 0.263121;0.272456;0.266500;0.264397;0.266106 | 1488000;1488000;1488000;1488000;1488000 | 19200;19200;19200;19200;19200 | 602816;603424;605248;606496;602368 | |
258 | densenet1_stage3_relu34_fwd | Activation | [1,1200,14,14] | 54.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 470400 | 0.00 | 19754.67 | 54.40 | 23.81 | 108.56 | false | 0.543160;0.544418;0.545143;0.548382;0.543242 | 470400;470400;470400;470400;470400 | 0;256;0;0;0 | 18304;16000;20480;20608;20480 | |
259 | densenet1_stage3_conv34_fwd | Convolution | [1,1200,14,14] | 5146.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 77.33 | 103256832 | 921600.00 | 102080.00 | 3.10 | 100.87 | 1335.22 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 103256832;103256832;103256832;103256832;103256832 | 921600;921600;921600;921600;921600 | 102112;101856;101984;102144;102240 | |
260 | densenet1_stage3_batchnorm35_fwd | BatchNorm | [1,192,14,14] | 67.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 53834.67 | 6.80 | 4.18 | 47.62 | true | 0.068136;0.068238;0.068066;0.068176;0.068743 | 238080;238080;238080;238080;238080 | 5120;3072;3072;3072;3072 | 53792;54304;53792;53280;53920 | |
261 | densenet1_stage3_relu35_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 6101.33 | 33.10 | 11.90 | 18.82 | true | 0.336997;0.339054;0.316779;0.317405;0.338978 | 75264;75264;75264;75264;75264 | 224;224;224;0;224 | 6144;5888;6400;5888;6272 | |
262 | densenet1_stage3_conv35_fwd | Convolution | [1,192,14,14] | 2456.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 226176.00 | 12.50 | 116.37 | 526.42 | false | 0.124915;0.124917;0.124915;0.124911;0.124917 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 229248;222592;211072;226688;240896 | |
262 | densenet1_stage3_conv35_fwd | Convolution | [1,192,14,14] | 2456.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 331776.00 | 191648.00 | 7.50 | 1.36 | 152.71 | true | 0.075282;0.075171;0.075183;0.075172;0.075283 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 193184;193056;191392;187040;190496 | |
263 | densenet1_stage3_concat17 | Concat | [1,1200,14,14] | 66.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 0.00 | 15205.33 | 37.50 | 0.00 | 0.00 | true | 0.626846;0.121848;0.628339;0.121848;0.628655;0.121838;0.627873;0.121804;0.631657;0.121797 | 0;0;0;0;0;0;0;0;0;0 | 256;30432;384;30240;256;30400;256;29984;256;30112 | 0;0;512;0;0;0;1024;0;0;0 | |
263 | densenet1_stage3_concat17 | Concat | [1,1200,14,14] | 66.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.43 | 0 | 0.00 | 15205.33 | 37.50 | 0.00 | 0.00 | true | 0.626846;0.121848;0.628339;0.121848;0.628655;0.121838;0.627873;0.121804;0.631657;0.121797 | 0;0;0;0;0;0;0;0;0;0 | 256;30432;384;30240;256;30400;256;29984;256;30112 | 0;0;512;0;0;0;1024;0;0;0 | |
264 | densenet1_stage3_batchnorm36_fwd | BatchNorm | [1,1248,14,14] | 80.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1547520 | 20096.00 | 938624.00 | 27.20 | 1.61 | 221.07 | true | 0.270059;0.267599;0.273833;0.275221;0.272961 | 1547520;1547520;1547520;1547520;1547520 | 20096;20096;20096;20032;20096 | 941760;936128;937664;938592;939616 | |
265 | densenet1_stage3_relu36_fwd | Activation | [1,1248,14,14] | 55.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 489216 | 0.00 | 35509.33 | 54.80 | 13.78 | 104.82 | true | 0.550819;0.547519;0.550057;0.547510;0.547553 | 489216;489216;489216;489216;489216 | 0;0;0;0;0 | 34912;35360;35296;35872;37216 | |
266 | densenet1_stage3_conv36_fwd | Convolution | [1,1248,14,14] | 5343.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 80.00 | 107385600 | 958464.00 | 131786.67 | 3.10 | 98.50 | 1342.32 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 107385600;107385600;107385600;107385600;107385600 | 958464;958464;958464;958464;958464 | 130528;134112;133088;131744;129760 | |
267 | densenet1_stage3_batchnorm37_fwd | BatchNorm | [1,192,14,14] | 66.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 101952.00 | 6.80 | 2.27 | 51.01 | true | 0.068150;0.068143;0.068070;0.068044;0.068203 | 238080;238080;238080;238080;238080 | 3072;5632;3072;3072;3072 | 102176;102624;101152;101504;102176 | |
268 | densenet1_stage3_relu37_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 75264 | 74.67 | 11946.67 | 33.10 | 6.26 | 17.37 | true | 0.388237;0.315629;0.337641;0.337698;0.318221 | 75264;75264;75264;75264;75264 | 480;0;224;0;0 | 11904;11776;13312;11680;12160 | |
269 | densenet1_stage3_conv37_fwd | Convolution | [1,192,14,14] | 2448.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.00 | 26320896 | 0.00 | 319104.00 | 12.50 | 82.48 | 537.16 | false | 0.124930;0.124931;0.124932;0.124932;0.124920 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 318336;319552;309504;322176;319424 | |
269 | densenet1_stage3_conv37_fwd | Convolution | [1,192,14,14] | 2448.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 336800.00 | 7.50 | 1.07 | 142.54 | true | 0.075071;0.075154;0.075290;0.075218;0.075275 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 337536;336480;341216;336384;336384 | |
270 | densenet1_stage3_concat18 | Concat | [1,1248,14,14] | 61.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 0.00 | 413658.67 | 37.80 | 0.00 | 0.00 | true | 0.632887;0.121799;0.636299;0.121910;0.634582;0.121896;0.635050;0.121890;0.638790;0.121799 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;2048;128;0;0;0 | 785376;39008;788608;38240;788384;41568;795008;39008;789664;37856 | |
270 | densenet1_stage3_concat18 | Concat | [1,1248,14,14] | 61.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.43 | 0 | 0.00 | 413658.67 | 37.80 | 0.00 | 0.00 | true | 0.632887;0.121799;0.636299;0.121910;0.634582;0.121896;0.635050;0.121890;0.638790;0.121799 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;2048;128;0;0;0 | 785376;39008;788608;38240;788384;41568;795008;39008;789664;37856 | |
271 | densenet1_stage3_batchnorm38_fwd | BatchNorm | [1,1296,14,14] | 69 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1607040 | 20736.00 | 704672.00 | 28.00 | 2.22 | 229.58 | true | 0.276617;0.282545;0.279847;0.278880;0.280395 | 1607040;1607040;1607040;1607040;1607040 | 20736;20736;29952;20736;20736 | 704416;705920;701856;705216;704384 | |
272 | densenet1_stage3_relu38_fwd | Activation | [1,1296,14,14] | 57 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 508032 | 0.00 | 20693.33 | 55.60 | 24.55 | 101.61 | false | 0.556913;0.554707;0.559899;0.556324;0.555662 | 508032;508032;508032;508032;508032 | 0;1792;0;0;0 | 22656;19840;19968;22144;19968 | |
273 | densenet1_stage3_conv38_fwd | Convolution | [1,1296,14,14] | 5560 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 83.00 | 111514368 | 995328.00 | 126602.67 | 3.10 | 99.40 | 1343.55 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 111514368;111514368;111514368;111514368;111514368 | 995328;995328;995328;995328;1000704 | 126944;126560;126688;126560;126176 | |
274 | densenet1_stage3_batchnorm39_fwd | BatchNorm | [1,192,14,14] | 56.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 131701.33 | 6.80 | 1.77 | 47.62 | true | 0.068293;0.068242;0.068162;0.068099;0.068068 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 132256;131360;131872;131872;131360 | |
275 | densenet1_stage3_relu39_fwd | Activation | [1,192,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 75264 | 224.00 | 14976.00 | 34.20 | 4.95 | 17.37 | true | 0.347430;0.339864;0.362020;0.339044;0.339494 | 75264;75264;75264;75264;75264 | 224;224;224;0;224 | 14848;14976;14976;14976;15104 | |
276 | densenet1_stage3_conv39_fwd | Convolution | [1,192,14,14] | 2461 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.00 | 26320896 | 0.00 | 267776.00 | 12.50 | 98.29 | 537.16 | false | 0.124920;0.124913;0.124914;0.124910;0.124917 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 269248;269568;264512;257536;273408 | |
276 | densenet1_stage3_conv39_fwd | Convolution | [1,192,14,14] | 2461 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 238794.67 | 7.50 | 1.25 | 142.54 | true | 0.075094;0.075229;0.075211;0.075270;0.075257 | 712704;712704;712704;712704;712704 | 331776;331776;331776;332032;331776 | 238240;236448;241696;247584;232992 | |
277 | densenet1_stage3_concat19 | Concat | [1,1296,14,14] | 69.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 17968.00 | 38.20 | 0.00 | 0.00 | true | 0.644369;0.121928;0.643122;0.121847;0.642620;0.121781;0.644228;0.121857;0.643205;0.121850 | 0;0;0;0;0;0;0;0;0;0 | 5536;30208;5920;30144;5536;30336;3104;30080;5920;30528 | 0;0;0;0;0;0;0;0;0;0 | |
277 | densenet1_stage3_concat19 | Concat | [1,1296,14,14] | 69.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 0.00 | 17968.00 | 38.20 | 0.00 | 0.00 | true | 0.644369;0.121928;0.643122;0.121847;0.642620;0.121781;0.644228;0.121857;0.643205;0.121850 | 0;0;0;0;0;0;0;0;0;0 | 5536;30208;5920;30144;5536;30336;3104;30080;5920;30528 | 0;0;0;0;0;0;0;0;0;0 | |
278 | densenet1_stage3_batchnorm40_fwd | BatchNorm | [1,1344,14,14] | 80.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1666560 | 21632.00 | 1009909.33 | 28.80 | 1.62 | 238.08 | true | 0.286230;0.286292;0.289293;0.288760;0.288238 | 1666560;1666560;1666560;1666560;1666560 | 1011968;1011648;1006112;1014432;1002688 | 21632;21632;21632;21600;21632 | |
279 | densenet1_stage3_relu40_fwd | Activation | [1,1344,14,14] | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 526848 | 0.00 | 40352.00 | 56.60 | 13.06 | 105.37 | true | 0.565489;0.563080;0.565378;0.565914;0.566147 | 526848;526848;526848;526848;526848 | 0;256;0;0;0 | 40416;38880;42080;39456;41184 | |
280 | densenet1_stage3_conv40_fwd | Convolution | [1,1344,14,14] | 5735 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.00 | 115643136 | 1032192.00 | 155413.33 | 3.10 | 97.38 | 1329.23 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 115643136;115643136;115643136;115643136;115643136 | 154848;155904;155488;154784;157408 | 1034240;1032192;1032192;1032192;1032192 | |
281 | densenet1_stage3_batchnorm41_fwd | BatchNorm | [1,192,14,14] | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 238080 | 3072.00 | 108618.67 | 6.80 | 2.13 | 54.95 | true | 0.068795;0.067961;0.068042;0.068184;0.068074 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 108704;108448;108704;108320;108960 | |
282 | densenet1_stage3_relu41_fwd | Activation | [1,192,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 12330.67 | 33.00 | 5.99 | 18.82 | true | 0.337770;0.315493;0.337220;0.340631;0.314529 | 75264;75264;75264;75264;75264 | 224;224;224;0;224 | 12544;13440;12160;12288;12160 | |
283 | densenet1_stage3_conv41_fwd | Convolution | [1,192,14,14] | 2443.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 324053.33 | 12.50 | 81.22 | 526.42 | false | 0.124922;0.124925;0.124931;0.124932;0.124932 | 26320896;26320896;26320896;26320896;26320896 | 332736;331072;329152;311936;305984 | 0;0;0;0;0 | |
283 | densenet1_stage3_conv41_fwd | Convolution | [1,192,14,14] | 2443.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 330698.67 | 7.50 | 1.08 | 142.54 | true | 0.075238;0.075216;0.075452;0.075130;0.075250 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 323584;324704;327040;346752;340352 | |
284 | densenet1_stage3_concat20 | Concat | [1,1344,14,14] | 61.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.14 | 0 | 2240.00 | 455557.33 | 38.10 | 0.00 | 0.00 | true | 0.639093;0.121840;0.644401;0.121838;0.639786;0.121866;0.641304;0.121874;0.643278;0.121960 | 0;0;0;0;0;0;0;0;0;0 | 6144;0;5248;0;5120;0;3200;0;5120;0 | 883392;44512;878080;43232;875008;45024;869120;43232;856192;43488 | |
284 | densenet1_stage3_concat20 | Concat | [1,1344,14,14] | 61.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.57 | 0 | 2240.00 | 455557.33 | 38.10 | 0.00 | 0.00 | true | 0.639093;0.121840;0.644401;0.121838;0.639786;0.121866;0.641304;0.121874;0.643278;0.121960 | 0;0;0;0;0;0;0;0;0;0 | 6144;0;5248;0;5120;0;3200;0;5120;0 | 883392;44512;878080;43232;875008;45024;869120;43232;856192;43488 | |
285 | densenet1_stage3_batchnorm42_fwd | BatchNorm | [1,1392,14,14] | 72.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 1726080 | 22272.00 | 839936.00 | 28.90 | 2.00 | 235.39 | true | 0.290576;0.288351;0.283664;0.292532;0.287205 | 1726080;1726080;1726080;1726080;1726080 | 22272;22272;22272;22272;22272 | 839712;842848;838176;841920;836832 | |
286 | densenet1_stage3_relu42_fwd | Activation | [1,1392,14,14] | 60.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 545664 | 0.00 | 22741.33 | 57.30 | 23.99 | 116.92 | false | 0.572726;0.571614;0.574547;0.571724;0.573992 | 545664;545664;545664;545664;545664 | 0;0;0;0;0 | 21760;22528;22656;23808;23040 | |
287 | densenet1_stage3_conv42_fwd | Convolution | [1,1392,14,14] | 5967.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 90.00 | 119771904 | 1069056.00 | 127285.33 | 3.10 | 100.12 | 1330.80 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 119771904;119771904;119771904;119771904;119771904 | 127200;127328;127328;126944;127968 | 1069056;1069056;1069056;1069056;1069056 | |
288 | densenet1_stage3_batchnorm43_fwd | BatchNorm | [1,192,14,14] | 60 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 132384.00 | 6.80 | 1.76 | 47.62 | true | 0.068183;0.068192;0.068160;0.068098;0.068239 | 238080;238080;238080;238080;238080 | 3328;3072;3072;3072;3072 | 133408;132384;132512;132256;131744 | |
289 | densenet1_stage3_relu43_fwd | Activation | [1,192,14,14] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 15658.67 | 33.90 | 4.74 | 18.82 | true | 0.338462;0.342932;0.336294;0.338011;0.340518 | 75264;75264;75264;75264;75264 | 224;224;224;0;224 | 14592;15744;16000;15232;16128 | |
290 | densenet1_stage3_conv43_fwd | Convolution | [1,192,14,14] | 2448 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 295082.67 | 12.50 | 89.20 | 529.95 | false | 0.124914;0.124910;0.124922;0.124909;0.124918 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 292928;290528;297920;294400;315712 | |
290 | densenet1_stage3_conv43_fwd | Convolution | [1,192,14,14] | 2448 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 254794.67 | 7.50 | 1.22 | 142.54 | true | 0.075154;0.075219;0.075226;0.075286;0.075086 | 712704;712704;712704;712704;712704 | 254112;257952;252320;262304;245792 | 331776;331776;336896;331776;331776 | |
291 | densenet1_stage3_concat21 | Concat | [1,1392,14,14] | 70.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 0.00 | 19402.67 | 38.80 | 0.00 | 0.00 | true | 0.656183;0.121872;0.654839;0.121991;0.654990;0.121932;0.655850;0.121867;0.655065;0.121894 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;7168;6144;0;0;0;0 | 8608;29984;8736;30240;8608;32032;2976;30240;8608;31264 | |
291 | densenet1_stage3_concat21 | Concat | [1,1392,14,14] | 70.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 0.00 | 19402.67 | 38.80 | 0.00 | 0.00 | true | 0.656183;0.121872;0.654839;0.121991;0.654990;0.121932;0.655850;0.121867;0.655065;0.121894 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;7168;6144;0;0;0;0 | 8608;29984;8736;30240;8608;32032;2976;30240;8608;31264 | |
292 | densenet1_stage3_batchnorm44_fwd | BatchNorm | [1,1440,14,14] | 82.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 1785600 | 23146.67 | 1083296.00 | 29.30 | 1.61 | 255.09 | true | 0.292222;0.288309;0.291736;0.294610;0.299670 | 1785600;1785600;1785600;1785600;1785600 | 1069600;1082496;1083424;1083968;1085152 | 23104;23168;23168;23104;23168 | |
293 | densenet1_stage3_relu44_fwd | Activation | [1,1440,14,14] | 62 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 564480 | 0.00 | 54442.67 | 58.00 | 10.37 | 112.90 | true | 0.581334;0.576791;0.580808;0.577690;0.582152 | 564480;564480;564480;564480;564480 | 56576;54688;52544;56096;51552 | 0;0;0;0;256 | |
294 | densenet1_stage3_conv44_fwd | Convolution | [1,1440,14,14] | 6155.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 92.67 | 123900672 | 1105920.00 | 154890.67 | 3.10 | 98.27 | 1337.05 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 123900672;123900672;123900672;123900672;123900672 | 1105920;1105920;1105920;1105920;1105920 | 156832;155744;154720;153632;154208 | |
295 | densenet1_stage3_batchnorm45_fwd | BatchNorm | [1,192,14,14] | 60 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 116714.67 | 6.80 | 1.99 | 47.62 | true | 0.068196;0.067902;0.068053;0.068183;0.068270 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 115872;117984;116384;116352;117408 | |
296 | densenet1_stage3_relu45_fwd | Activation | [1,192,14,14] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 20576.00 | 33.00 | 3.62 | 18.82 | true | 0.337210;0.317002;0.337049;0.340051;0.315929 | 75264;75264;75264;75264;75264 | 224;224;224;0;224 | 21120;20096;21760;20512;20096 | |
297 | densenet1_stage3_conv45_fwd | Convolution | [1,192,14,14] | 2459.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.00 | 26320896 | 0.00 | 317664.00 | 12.50 | 82.86 | 537.16 | false | 0.124924;0.124927;0.124924;0.124919;0.124926 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 297952;308960;321376;329504;322656 | |
297 | densenet1_stage3_conv45_fwd | Convolution | [1,192,14,14] | 2459.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 338336.00 | 7.50 | 1.06 | 142.54 | true | 0.075250;0.075266;0.075382;0.075294;0.075395 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 358368;346624;334848;328704;333536 | |
298 | densenet1_stage3_concat22 | Concat | [1,1440,14,14] | 65.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.71 | 0 | 68736.00 | 474197.33 | 39.20 | 0.00 | 0.00 | true | 0.662886;0.121902;0.664176;0.121881;0.662786;0.121903;0.662194;0.121895;0.664420;0.122020 | 0;0;0;0;0;0;0;0;0;0 | 886592;40800;906720;40288;928576;41696;943296;40160;932448;40800 | 140544;0;137344;0;139776;0;135296;0;144384;0 | |
298 | densenet1_stage3_concat22 | Concat | [1,1440,14,14] | 65.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 68736.00 | 474197.33 | 39.20 | 0.00 | 0.00 | true | 0.662886;0.121902;0.664176;0.121881;0.662786;0.121903;0.662194;0.121895;0.664420;0.122020 | 0;0;0;0;0;0;0;0;0;0 | 140544;0;137344;0;139776;0;135296;0;144384;0 | 886592;40800;906720;40288;928576;41696;943296;40160;932448;40800 | |
299 | densenet1_stage3_batchnorm46_fwd | BatchNorm | [1,1488,14,14] | 74.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 1845120 | 23808.00 | 931157.33 | 30.10 | 1.93 | 251.62 | true | 0.302356;0.303510;0.297862;0.303715;0.297863 | 1845120;1845120;1845120;1845120;1845120 | 23808;29184;23808;23808;23808 | 922912;930656;931104;938944;931712 | |
300 | densenet1_stage3_relu46_fwd | Activation | [1,1488,14,14] | 64.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 583296 | 0.00 | 35029.33 | 58.70 | 16.65 | 116.66 | true | 0.587252;0.585399;0.588109;0.588204;0.585142 | 583296;583296;583296;583296;583296 | 34560;37504;35712;31616;34816 | 2048;0;0;0;0 | |
301 | densenet1_stage3_conv46_fwd | Convolution | [1,1488,14,14] | 6371.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 95.00 | 128029440 | 1142784.00 | 127413.33 | 3.10 | 100.79 | 1347.68 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 128029440;128029440;128029440;128029440;128029440 | 1142784;1142784;1142784;1142784;1145088 | 127968;127456;127200;126560;127584 | |
302 | densenet1_stage3_batchnorm47_fwd | BatchNorm | [1,192,14,14] | 59.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 238080 | 3072.00 | 133024.00 | 6.80 | 1.75 | 54.95 | true | 0.068041;0.068130;0.066252;0.067888;0.067945 | 238080;238080;238080;238080;238080 | 133792;132768;133216;133088;132384 | 3072;3072;3072;3072;3072 | |
303 | densenet1_stage3_relu47_fwd | Activation | [1,192,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 75264 | 224.00 | 15274.67 | 33.90 | 4.86 | 17.37 | true | 0.338293;0.340276;0.359199;0.338198;0.338949 | 75264;75264;75264;75264;75264 | 224;224;224;0;224 | 14592;15360;15360;15104;15616 | |
304 | densenet1_stage3_conv47_fwd | Convolution | [1,192,14,14] | 2455.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 316010.67 | 12.50 | 83.29 | 529.95 | false | 0.124904;0.124915;0.124912;0.124914;0.124911 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;1792;0 | 314240;324320;317536;298400;316256 | |
304 | densenet1_stage3_conv47_fwd | Convolution | [1,192,14,14] | 2455.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 331776.00 | 280437.33 | 7.50 | 1.16 | 152.71 | true | 0.075061;0.075270;0.075331;0.075343;0.075166 | 712704;712704;712704;712704;712704 | 304160;283552;270752;279072;278688 | 331776;331776;331776;331776;331776 | |
305 | densenet1_stage3_concat23 | Concat | [1,1488,14,14] | 74.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 6400.00 | 20176.00 | 39.40 | 0.00 | 0.00 | true | 0.666238;0.121925;0.667871;0.121928;0.666964;0.121825;0.665735;0.121852;0.666390;0.121905 | 0;0;0;0;0;0;0;0;0;0 | 13312;0;12672;0;15488;0;16896;0;12416;0 | 10048;30976;10496;30208;9888;30336;5120;30080;9184;31104 | |
305 | densenet1_stage3_concat23 | Concat | [1,1488,14,14] | 74.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.71 | 0 | 6400.00 | 20176.00 | 39.40 | 0.00 | 0.00 | true | 0.666238;0.121925;0.667871;0.121928;0.666964;0.121825;0.665735;0.121852;0.666390;0.121905 | 0;0;0;0;0;0;0;0;0;0 | 10048;30976;10496;30208;9888;30336;5120;30080;9184;31104 | 13312;0;12672;0;15488;0;16896;0;12416;0 | |
306 | densenet1_stage3_batchnorm48_fwd | BatchNorm | [1,1536,14,14] | 88.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 1904640 | 24725.33 | 1154400.00 | 30.60 | 1.62 | 238.08 | true | 0.306807;0.303728;0.302956;0.308358;0.310150 | 1904640;1904640;1904640;1904640;1904640 | 24736;24768;24736;24640;24704 | 1154976;1152896;1157248;1150080;1155328 | |
307 | densenet1_stage3_relu48_fwd | Activation | [1,1536,14,14] | 65.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 602112 | 0.00 | 62944.00 | 59.20 | 9.57 | 120.42 | true | 0.591984;0.592154;0.590835;0.591871;0.591255 | 602112;602112;602112;602112;602112 | 0;0;0;0;0 | 62816;63008;63200;63008;61088 | |
308 | densenet1_stage3_conv48_fwd | Convolution | [1,1536,14,14] | 6562.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 98.33 | 132158208 | 1179648.00 | 155701.33 | 3.10 | 98.97 | 1343.99 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 132158208;132158208;132158208;132158208;132158208 | 1179648;1179648;1179648;1179648;1179648 | 154848;155616;155616;156064;155872 | |
309 | densenet1_stage3_batchnorm49_fwd | BatchNorm | [1,192,14,14] | 59.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 129440.00 | 6.80 | 1.80 | 51.01 | true | 0.068159;0.068262;0.068144;0.068099;0.068139 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 128800;129696;129824;128384;130592 | |
310 | densenet1_stage3_relu49_fwd | Activation | [1,192,14,14] | 15.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 19285.33 | 33.10 | 3.86 | 18.82 | true | 0.337914;0.315979;0.346507;0.338265;0.315603 | 75264;75264;75264;75264;75264 | 224;224;224;0;224 | 19712;19456;18176;19872;18688 | |
311 | densenet1_stage3_conv49_fwd | Convolution | [1,192,14,14] | 2497.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 330880.00 | 12.50 | 79.55 | 526.42 | false | 0.124930;0.124931;0.124920;0.124924;0.124931 | 26320896;26320896;26320896;26320896;26320896 | 0;0;6656;0;0 | 328320;323104;332512;331808;333152 | |
311 | densenet1_stage3_conv49_fwd | Convolution | [1,192,14,14] | 2497.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 331776.00 | 350730.67 | 7.50 | 1.04 | 152.71 | true | 0.075147;0.075209;0.075237;0.075403;0.075334 | 712704;712704;712704;712704;712704 | 331776;337152;331776;331776;331776 | 352320;357344;348320;351552;347680 | |
312 | densenet1_stage3_concat24 | Concat | [1,1536,14,14] | 71.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.43 | 0 | 106453.33 | 529392.00 | 39.50 | 0.00 | 0.00 | true | 0.667999;0.121843;0.668612;0.121809;0.670500;0.121874;0.670566;0.121904;0.669329;0.121812 | 0;0;0;0;0;0;0;0;0;0 | 1013792;39136;1022624;39776;1022976;39904;1018528;39904;1021632;42592 | 207744;0;214400;0;214272;0;211328;0;215680;5376 | |
312 | densenet1_stage3_concat24 | Concat | [1,1536,14,14] | 71.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.00 | 0 | 106453.33 | 529392.00 | 39.50 | 0.00 | 0.00 | true | 0.667999;0.121843;0.668612;0.121809;0.670500;0.121874;0.670566;0.121904;0.669329;0.121812 | 0;0;0;0;0;0;0;0;0;0 | 1013792;39136;1022624;39776;1022976;39904;1018528;39904;1021632;42592 | 207744;0;214400;0;214272;0;211328;0;215680;5376 | |
313 | densenet1_stage3_batchnorm50_fwd | BatchNorm | [1,1584,14,14] | 77.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 1964160 | 25344.00 | 1061301.33 | 31.60 | 1.81 | 245.52 | true | 0.319883;0.319296;0.313892;0.309708;0.315646 | 1964160;1964160;1964160;1964160;1964160 | 25344;25344;25344;25344;25344 | 1059616;1059232;1062560;1065088;1061728 | |
314 | densenet1_stage3_relu50_fwd | Activation | [1,1584,14,14] | 76.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 620928 | 0.00 | 45109.33 | 59.40 | 13.76 | 124.19 | true | 0.594057;0.594823;0.595336;0.592721;0.590657 | 620928;620928;620928;620928;620928 | 0;0;0;0;0 | 45568;45952;41728;44288;45472 | |
315 | densenet1_stage3_conv50_fwd | Convolution | [1,1584,14,14] | 6809.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 101.00 | 136286976 | 1216512.00 | 127925.33 | 3.10 | 101.37 | 1349.38 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 136286976;136286976;136286976;136286976;136286976 | 1216512;1216512;1216512;1216512;1216512 | 127968;127712;128096;126560;128736 | |
316 | densenet1_stage3_batchnorm51_fwd | BatchNorm | [1,192,14,14] | 63.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 132341.33 | 6.80 | 1.76 | 47.62 | true | 0.067994;0.068197;0.068291;0.068116;0.068573 | 238080;238080;238080;238080;238080 | 132384;132384;132256;132736;131744 | 3072;3072;3072;3072;3072 | |
317 | densenet1_stage3_relu51_fwd | Activation | [1,192,14,14] | 19.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 15829.33 | 34.00 | 4.69 | 18.82 | true | 0.340883;0.340039;0.336137;0.341160;0.339330 | 75264;75264;75264;75264;75264 | 224;224;224;256;224 | 15872;15488;16128;14592;16128 | |
318 | densenet1_stage3_conv51_fwd | Convolution | [1,192,14,14] | 2460.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 323253.33 | 12.50 | 81.42 | 529.95 | false | 0.124909;0.124914;0.124911;0.124907;0.124916 | 26320896;26320896;26320896;26320896;26320896 | 309728;329056;316896;332064;323808 | 0;0;0;0;5120 | |
318 | densenet1_stage3_conv51_fwd | Convolution | [1,192,14,14] | 2460.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 331776.00 | 289354.67 | 7.50 | 1.15 | 152.71 | true | 0.075106;0.075169;0.074968;0.075173;0.075156 | 712704;712704;712704;712704;712704 | 305952;286624;295200;280096;286240 | 331776;331776;331776;331776;331776 | |
319 | densenet1_stage3_concat25 | Concat | [1,1584,14,14] | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 0.00 | 21968.00 | 40.00 | 0.00 | 0.00 | true | 0.677527;0.121907;0.678569;0.121920;0.677883;0.121829;0.678470;0.121800;0.677418;0.121943 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;768;0 | 13600;30240;13984;30880;13600;30112;3104;29984;13888;31008 | |
319 | densenet1_stage3_concat25 | Concat | [1,1584,14,14] | 76 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.00 | 0 | 0.00 | 21968.00 | 40.00 | 0.00 | 0.00 | true | 0.677527;0.121907;0.678569;0.121920;0.677883;0.121829;0.678470;0.121800;0.677418;0.121943 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;0;768;0 | 13600;30240;13984;30880;13600;30112;3104;29984;13888;31008 | |
320 | densenet1_stage3_batchnorm52_fwd | BatchNorm | [1,1632,14,14] | 94 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 2023680 | 26261.33 | 1234805.33 | 31.80 | 1.60 | 289.10 | true | 0.317793;0.319783;0.317321;0.317172;0.319279 | 2023680;2023680;2023680;2023680;2023680 | 1235904;1233664;1235552;1233760;1235104 | 27264;26208;26400;26176;26176 | |
321 | densenet1_stage3_relu52_fwd | Activation | [1,1632,14,14] | 69 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 639744 | 0.00 | 65589.33 | 58.80 | 9.75 | 127.95 | true | 0.589406;0.586369;0.589714;0.584005;0.586762 | 639744;639744;639744;639744;639744 | 0;0;0;0;0 | 65376;66080;61792;68512;65312 | |
322 | densenet1_stage3_conv52_fwd | Convolution | [1,1632,14,14] | 6967.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 104.00 | 140415744 | 1253376.00 | 150965.33 | 3.10 | 99.99 | 1350.15 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 140415744;140415744;140415744;140415744;140415744 | 1253376;1253376;1253376;1253376;1253376 | 150528;151328;150528;151200;151168 | |
323 | densenet1_stage3_batchnorm53_fwd | BatchNorm | [1,192,14,14] | 66 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 238080 | 3072.00 | 126485.33 | 6.80 | 1.84 | 54.95 | true | 0.067773;0.068097;0.068698;0.067943;0.068093 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 125344;126496;126496;126464;126880 | |
324 | densenet1_stage3_relu53_fwd | Activation | [1,192,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 75264 | 224.00 | 23722.67 | 33.80 | 3.14 | 17.37 | true | 0.337053;0.337741;0.338537;0.376901;0.339024 | 75264;75264;75264;75264;75264 | 224;224;224;224;224 | 23936;23680;24064;21152;23552 | |
325 | densenet1_stage3_conv53_fwd | Convolution | [1,192,14,14] | 2461.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 382122.67 | 12.50 | 68.88 | 526.42 | false | 0.124919;0.124918;0.124930;0.124932;0.124931 | 26320896;26320896;26320896;26320896;26320896 | 380960;388576;380000;385408;369504 | 0;5120;0;0;0 | |
325 | densenet1_stage3_conv53_fwd | Convolution | [1,192,14,14] | 2461.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331861.33 | 376458.67 | 7.50 | 1.01 | 142.54 | true | 0.075337;0.075222;0.075191;0.075274;0.074993 | 712704;712704;712704;712704;712704 | 333824;331776;331776;332032;331776 | 377216;369312;377888;374272;383072 | |
326 | densenet1_stage3_concat26 | Concat | [1,1632,14,14] | 68.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 108096.00 | 611093.33 | 38.80 | 0.00 | 0.00 | true | 0.654804;0.121845;0.655241;0.121781;0.656350;0.121940;0.652943;0.121884;0.656516;0.121815 | 0;0;0;0;0;0;0;0;0;0 | 216448;0;217472;0;217344;0;216064;0;216064;0 | 1191424;38400;1199808;39040;1183744;38144;1192608;38400;1175552;37728 | |
326 | densenet1_stage3_concat26 | Concat | [1,1632,14,14] | 68.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.86 | 0 | 108096.00 | 611093.33 | 38.80 | 0.00 | 0.00 | true | 0.654804;0.121845;0.655241;0.121781;0.656350;0.121940;0.652943;0.121884;0.656516;0.121815 | 0;0;0;0;0;0;0;0;0;0 | 216448;0;217472;0;217344;0;216064;0;216064;0 | 1191424;38400;1199808;39040;1183744;38144;1192608;38400;1175552;37728 | |
327 | densenet1_stage3_batchnorm54_fwd | BatchNorm | [1,1680,14,14] | 77.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 2083200 | 26880.00 | 1278688.00 | 32.40 | 1.60 | 260.40 | true | 0.326775;0.323926;0.323157;0.324035;0.319710 | 2083200;2083200;2083200;2083200;2083200 | 26880;26880;26880;26880;26880 | 1281472;1278304;1275200;1276288;1283616 | |
328 | densenet1_stage3_relu54_fwd | Activation | [1,1680,14,14] | 72.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 658560 | 853.33 | 48384.00 | 58.10 | 13.38 | 141.11 | true | 0.578577;0.580361;0.587490;0.585397;0.578645 | 658560;658560;658560;658560;658560 | 5376;2048;0;512;0 | 47360;49920;46976;51200;47872 | |
329 | densenet1_stage3_conv54_fwd | Convolution | [1,1680,14,14] | 7197.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 106.67 | 144544512 | 1290240.00 | 127541.33 | 3.10 | 101.95 | 1355.10 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 144544512;144544512;144544512;144544512;144544512 | 1290240;1290240;1290240;1290240;1290240 | 127584;127456;127968;127584;127328 | |
330 | densenet1_stage3_batchnorm55_fwd | BatchNorm | [1,192,14,14] | 60.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 132693.33 | 6.80 | 1.75 | 51.01 | true | 0.068344;0.068095;0.068008;0.068143;0.068175 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 133568;132768;132768;132256;132544 | |
331 | densenet1_stage3_relu55_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 15658.67 | 33.90 | 4.74 | 18.82 | true | 0.339285;0.342732;0.337849;0.337129;0.338732 | 75264;75264;75264;75264;75264 | 224;224;5344;224;224 | 15616;15616;15744;15488;15872 | |
332 | densenet1_stage3_conv55_fwd | Convolution | [1,192,14,14] | 2452.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.33 | 26320896 | 0.00 | 326474.67 | 12.50 | 80.62 | 533.54 | false | 0.124925;0.124920;0.124909;0.124916;0.124920 | 26320896;26320896;26320896;26320896;26320896 | 0;0;512;0;0 | 338016;325472;295904;327072;326880 | |
332 | densenet1_stage3_conv55_fwd | Convolution | [1,192,14,14] | 2452.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 340693.33 | 7.50 | 1.06 | 142.54 | true | 0.075416;0.075204;0.075251;0.075328;0.075259 | 712704;712704;712704;712704;712704 | 331776;331776;341760;331776;331776 | 331360;344320;353248;342784;334976 | |
333 | densenet1_stage3_concat27 | Concat | [1,1680,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.71 | 0 | 4608.00 | 26106.67 | 40.50 | 0.00 | 0.00 | true | 0.687516;0.121839;0.687025;0.121860;0.687719;0.121840;0.687124;0.121852;0.689074;0.121800 | 0;0;0;0;0;0;0;0;0;0 | 10496;0;10752;0;37120;0;6400;0;31104;0 | 16064;30720;15936;30464;29760;30464;6112;30464;19424;31104 | |
333 | densenet1_stage3_concat27 | Concat | [1,1680,14,14] | 78.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 4608.00 | 26106.67 | 40.50 | 0.00 | 0.00 | true | 0.687516;0.121839;0.687025;0.121860;0.687719;0.121840;0.687124;0.121852;0.689074;0.121800 | 0;0;0;0;0;0;0;0;0;0 | 10496;0;10752;0;37120;0;6400;0;31104;0 | 16064;30720;15936;30464;29760;30464;6112;30464;19424;31104 | |
334 | densenet1_stage3_batchnorm56_fwd | BatchNorm | [1,1728,14,14] | 99 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 2142720 | 27850.67 | 1309002.67 | 32.10 | 1.60 | 267.84 | true | 0.319793;0.327933;0.319787;0.323568;0.312841 | 2142720;2142720;2142720;2142720;2142720 | 1313952;1311424;1297632;1307872;1307712 | 27872;27808;27872;32864;27808 | |
335 | densenet1_stage3_relu56_fwd | Activation | [1,1728,14,14] | 73 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 677376 | 0.00 | 74912.00 | 60.80 | 9.04 | 135.48 | true | 0.614012;0.583159;0.614092;0.615527;0.595924 | 677376;677376;677376;677376;677376 | 72160;74336;76832;76832;73568 | 0;0;0;0;0 | |
336 | densenet1_stage3_conv56_fwd | Convolution | [1,1728,14,14] | 7364.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 110.00 | 148673280 | 1327104.00 | 149280.00 | 3.10 | 100.70 | 1351.58 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 148673280;148673280;148673280;148673280;148673280 | 1327104;1327104;1327104;1327104;1327104 | 150240;148704;149088;150048;147936 | |
337 | densenet1_stage3_batchnorm57_fwd | BatchNorm | [1,192,14,14] | 57.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 126229.33 | 6.80 | 1.84 | 51.01 | true | 0.068095;0.068342;0.068063;0.068621;0.067935 | 238080;238080;238080;238080;238080 | 125856;126592;126240;123776;127648 | 3072;3072;3072;3072;3072 | |
338 | densenet1_stage3_relu57_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 24106.67 | 33.80 | 3.09 | 18.82 | true | 0.337897;0.337438;0.343676;0.337909;0.338973 | 75264;75264;75264;75264;75264 | 224;224;224;224;224 | 23808;23936;24576;24736;23808 | |
339 | densenet1_stage3_conv57_fwd | Convolution | [1,192,14,14] | 2447.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 393184.00 | 12.50 | 66.94 | 529.95 | false | 0.124933;0.124929;0.124929;0.124925;0.124928 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 393184;393504;393568;392864;382912 | |
339 | densenet1_stage3_conv57_fwd | Convolution | [1,192,14,14] | 2447.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 394282.67 | 7.50 | 0.98 | 142.54 | true | 0.075214;0.075237;0.075177;0.075235;0.075099 | 712704;712704;712704;712704;712704 | 394048;393984;393792;394816;404672 | 331776;331776;331776;331776;331776 | |
340 | densenet1_stage3_concat28 | Concat | [1,1728,14,14] | 69.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.00 | 0 | 112896.00 | 660437.33 | 40.30 | 0.00 | 0.00 | true | 0.687280;0.121857;0.678830;0.121954;0.688998;0.121897;0.689729;0.121759;0.685880;0.121749 | 0;0;0;0;0;0;0;0;0;0 | 225536;512;225024;0;226944;0;224384;2048;225408;0 | 1276320;42080;1279584;42464;1279328;41056;1280832;42208;1280192;42720 | |
340 | densenet1_stage3_concat28 | Concat | [1,1728,14,14] | 69.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 112896.00 | 660437.33 | 40.30 | 0.00 | 0.00 | true | 0.687280;0.121857;0.678830;0.121954;0.688998;0.121897;0.689729;0.121759;0.685880;0.121749 | 0;0;0;0;0;0;0;0;0;0 | 225536;512;225024;0;226944;0;224384;2048;225408;0 | 1276320;42080;1279584;42464;1279328;41056;1280832;42208;1280192;42720 | |
341 | densenet1_stage3_batchnorm58_fwd | BatchNorm | [1,1776,14,14] | 80.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 2202240 | 28416.00 | 1422218.67 | 33.60 | 1.52 | 275.28 | true | 0.334253;0.339449;0.331193;0.346525;0.333828 | 2202240;2202240;2202240;2202240;2202240 | 28416;28416;28416;28416;28416 | 1423680;1418112;1427392;1417248;1424864 | |
342 | densenet1_stage3_relu58_fwd | Activation | [1,1776,14,14] | 75 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 696192 | 0.00 | 59413.33 | 59.90 | 11.72 | 139.24 | true | 0.598111;0.599639;0.596912;0.605139;0.599293 | 696192;696192;696192;696192;696192 | 0;0;0;0;0 | 59648;58368;52736;62976;60224 | |
343 | densenet1_stage3_conv58_fwd | Convolution | [1,1776,14,14] | 7590.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 112.00 | 152802048 | 1363968.00 | 127882.67 | 3.10 | 102.42 | 1364.30 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 152802048;152802048;152802048;152802048;152802048 | 1363968;1363968;1363968;1363968;1363968 | 128096;128224;127584;126816;127968 | |
344 | densenet1_stage3_batchnorm59_fwd | BatchNorm | [1,192,14,14] | 63 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 132714.67 | 6.80 | 1.75 | 51.01 | true | 0.068039;0.067982;0.068047;0.068135;0.068130 | 238080;238080;238080;238080;238080 | 3072;3072;3072;9984;3072 | 133152;132640;132128;132352;133152 | |
345 | densenet1_stage3_relu59_fwd | Activation | [1,192,14,14] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 992.00 | 15232.00 | 33.90 | 4.64 | 18.82 | true | 0.337839;0.340514;0.336292;0.338862;0.341483 | 75264;75264;75264;75264;75264 | 2528;224;224;2528;224 | 14208;15360;15744;15488;14848 | |
346 | densenet1_stage3_conv59_fwd | Convolution | [1,192,14,14] | 2448 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 375157.33 | 12.50 | 70.16 | 529.95 | false | 0.124907;0.124923;0.124918;0.124908;0.124918 | 26320896;26320896;26320896;26320896;26320896 | 376032;372320;374752;374688;385504 | 0;0;0;0;0 | |
346 | densenet1_stage3_conv59_fwd | Convolution | [1,192,14,14] | 2448 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 376362.67 | 7.50 | 1.01 | 142.54 | true | 0.075135;0.075135;0.075297;0.075330;0.075186 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 378112;371456;379520;379520;369024 | |
347 | densenet1_stage3_concat29 | Concat | [1,1776,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.00 | 0 | 3296.00 | 22010.67 | 40.80 | 0.00 | 0.00 | true | 0.693709;0.121949;0.693167;0.121844;0.695842;0.121835;0.695050;0.121836;0.695659;0.121802 | 0;0;0;0;0;0;0;0;0;0 | 14048;30464;13984;30496;13440;30272;3904;29856;13344;30752 | 8128;0;8768;0;8384;0;4032;0;7616;0 | |
347 | densenet1_stage3_concat29 | Concat | [1,1776,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.43 | 0 | 3296.00 | 22010.67 | 40.80 | 0.00 | 0.00 | true | 0.693709;0.121949;0.693167;0.121844;0.695842;0.121835;0.695050;0.121836;0.695659;0.121802 | 0;0;0;0;0;0;0;0;0;0 | 8128;0;8768;0;8384;0;4032;0;7616;0 | 14048;30464;13984;30496;13440;30272;3904;29856;13344;30752 | |
348 | densenet1_stage3_batchnorm60_fwd | BatchNorm | [1,1824,14,14] | 96.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 2261760 | 29706.67 | 1387200.00 | 33.60 | 1.60 | 282.72 | true | 0.330043;0.337795;0.333550;0.338373;0.336670 | 2261760;2261760;2261760;2261760;2261760 | 1390688;1385568;1388192;1385600;1387808 | 30272;31360;29376;29248;29472 | |
349 | densenet1_stage3_relu60_fwd | Activation | [1,1824,14,14] | 76.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 715008 | 0.00 | 88672.00 | 62.20 | 8.06 | 143.00 | true | 0.627395;0.605293;0.633280;0.647734;0.604600 | 715008;715008;715008;715008;715008 | 0;0;0;0;0 | 86688;90784;88544;95392;84960 | |
350 | densenet1_stage3_conv60_fwd | Convolution | [1,1824,14,14] | 7807.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 115.00 | 156930816 | 1400832.00 | 149237.33 | 3.10 | 101.24 | 1364.62 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 156930816;156930816;156930816;156930816;156930816 | 1400832;1400832;1400832;1400832;1400832 | 148896;149088;149728;150176;148736 | |
351 | densenet1_stage3_batchnorm61_fwd | BatchNorm | [1,192,14,14] | 61.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 127296.00 | 6.80 | 1.83 | 47.62 | true | 0.068194;0.068333;0.067108;0.068211;0.068191 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 127264;130080;125728;126720;127904 | |
352 | densenet1_stage3_relu61_fwd | Activation | [1,192,14,14] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 22880.00 | 33.80 | 3.26 | 18.82 | true | 0.338353;0.337710;0.338366;0.368150;0.337641 | 75264;75264;75264;75264;75264 | 224;224;9696;224;224 | 23424;20992;24704;22560;22656 | |
353 | densenet1_stage3_conv61_fwd | Convolution | [1,192,14,14] | 2451.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 388821.33 | 12.50 | 67.69 | 529.95 | false | 0.124918;0.124929;0.124925;0.124931;0.124920 | 26320896;26320896;26320896;26320896;26320896 | 393632;384800;389344;392320;378528 | 0;0;0;0;512 | |
353 | densenet1_stage3_conv61_fwd | Convolution | [1,192,14,14] | 2451.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 332458.67 | 396266.67 | 7.50 | 0.98 | 142.54 | true | 0.075178;0.075030;0.075104;0.075135;0.075203 | 712704;712704;712704;712704;712704 | 331776;333824;333824;331776;331776 | 394336;402400;399008;394848;394944 | |
354 | densenet1_stage3_concat30 | Concat | [1,1824,14,14] | 72 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.00 | 0 | 209813.33 | 695146.67 | 40.60 | 0.00 | 0.00 | true | 0.692748;0.122013;0.691275;0.121852;0.696854;0.121871;0.692623;0.121921;0.688824;0.121872 | 0;0;0;0;0;0;0;0;0;0 | 412288;0;426368;0;431232;0;420224;0;427264;0 | 1347712;42208;1348096;41344;1348672;43360;1347264;42240;1350880;40960 | |
354 | densenet1_stage3_concat30 | Concat | [1,1824,14,14] | 72 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.43 | 0 | 209813.33 | 695146.67 | 40.60 | 0.00 | 0.00 | true | 0.692748;0.122013;0.691275;0.121852;0.696854;0.121871;0.692623;0.121921;0.688824;0.121872 | 0;0;0;0;0;0;0;0;0;0 | 412288;0;426368;0;431232;0;420224;0;427264;0 | 1347712;42208;1348096;41344;1348672;43360;1347264;42240;1350880;40960 | |
355 | densenet1_stage3_batchnorm62_fwd | BatchNorm | [1,1872,14,14] | 83 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.33 | 2321280 | 29952.00 | 1516064.00 | 34.60 | 1.50 | 278.56 | true | 0.342993;0.349304;0.348440;0.344447;0.346117 | 2321280;2321280;2321280;2321280;2321280 | 29952;29952;29952;29952;29952 | 1513536;1518240;1520704;1516416;1513536 | |
356 | densenet1_stage3_relu62_fwd | Activation | [1,1872,14,14] | 78.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 733824 | 0.00 | 68448.00 | 61.80 | 10.72 | 146.76 | true | 0.617640;0.629591;0.617022;0.620347;0.613161 | 733824;733824;733824;733824;733824 | 0;0;0;0;0 | 72000;65536;62080;68096;71712 | |
357 | densenet1_stage3_conv62_fwd | Convolution | [1,1872,14,14] | 8015 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 118.00 | 161059584 | 1437696.00 | 134581.33 | 3.10 | 102.44 | 1364.91 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 161059584;161059584;161059584;161059584;161059584 | 1437696;1437696;1437696;1437952;1437696 | 134368;133984;134752;135520;134624 | |
358 | densenet1_stage3_batchnorm63_fwd | BatchNorm | [1,192,14,14] | 57.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 238080 | 3072.00 | 134058.67 | 6.80 | 1.74 | 54.95 | true | 0.068193;0.068190;0.068143;0.068243;0.068552 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 133696;133920;135968;134560;133152 | |
359 | densenet1_stage3_relu63_fwd | Activation | [1,192,14,14] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 75264 | 224.00 | 16469.33 | 35.00 | 4.51 | 17.37 | true | 0.368523;0.338626;0.389824;0.338377;0.343530 | 75264;75264;75264;75264;75264 | 224;224;224;224;224 | 17024;16256;14592;16128;17536 | |
360 | densenet1_stage3_conv63_fwd | Convolution | [1,192,14,14] | 2469.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 0.00 | 392544.00 | 12.50 | 67.05 | 529.95 | false | 0.124920;0.124917;0.124910;0.124919;0.124910 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 392288;379104;392672;393120;392672 | |
360 | densenet1_stage3_conv63_fwd | Convolution | [1,192,14,14] | 2469.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 394272.00 | 7.50 | 0.98 | 142.54 | true | 0.075272;0.075250;0.075238;0.075241;0.075037 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 394528;394240;394368;392832;394208 | |
361 | densenet1_stage3_concat31 | Concat | [1,1872,14,14] | 83.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.14 | 0 | 6464.00 | 53898.67 | 41.20 | 0.00 | 0.00 | true | 0.705239;0.121904;0.703807;0.121970;0.707676;0.121899;0.701557;0.121918;0.703559;0.121874 | 0;0;0;0;0;0;0;0;0;0 | 17792;0;13440;0;11904;0;19968;0;13440;0 | 86304;29728;82080;30336;84576;30496;65472;30080;84544;30464 | |
361 | densenet1_stage3_concat31 | Concat | [1,1872,14,14] | 83.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 6464.00 | 53898.67 | 41.20 | 0.00 | 0.00 | true | 0.705239;0.121904;0.703807;0.121970;0.707676;0.121899;0.701557;0.121918;0.703559;0.121874 | 0;0;0;0;0;0;0;0;0;0 | 17792;0;13440;0;11904;0;19968;0;13440;0 | 86304;29728;82080;30336;84576;30496;65472;30080;84544;30464 | |
362 | densenet1_stage3_batchnorm64_fwd | BatchNorm | [1,1920,14,14] | 102.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.00 | 2380800 | 30880.00 | 1392352.00 | 33.90 | 1.67 | 297.60 | true | 0.339409;0.341602;0.337751;0.340167;0.335515 | 2380800;2380800;2380800;2380800;2380800 | 1389984;1395904;1387872;1400768;1391168 | 30912;30912;30816;30784;31072 | |
363 | densenet1_stage3_relu64_fwd | Activation | [1,1920,14,14] | 80.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 752640 | 0.00 | 106677.33 | 64.20 | 7.06 | 150.53 | true | 0.643533;0.639143;0.647165;0.643925;0.624024 | 752640;752640;752640;752640;752640 | 0;0;0;128;0 | 106368;105760;112576;106528;107136 | |
364 | densenet1_stage3_conv64_fwd | Convolution | [1,1920,14,14] | 8212.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 121.00 | 165188352 | 1474560.00 | 137258.67 | 3.10 | 102.49 | 1365.19 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 165188352;165188352;165188352;165188352;165188352 | 1474560;1474560;1474560;1474560;1474560 | 137376;137824;136928;136928;137472 | |
365 | densenet1_stage3_batchnorm65_fwd | BatchNorm | [1,192,14,14] | 67.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 140160.00 | 6.80 | 1.66 | 47.62 | true | 0.068013;0.068203;0.068150;0.068168;0.067252 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;3072 | 138592;139680;140384;140416;140576 | |
366 | densenet1_stage3_relu65_fwd | Activation | [1,192,14,14] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 22901.33 | 33.90 | 3.25 | 18.82 | true | 0.339553;0.334291;0.362697;0.339496;0.339100 | 75264;75264;75264;75264;75264 | 224;224;224;224;224 | 23040;23296;22784;22880;22528 | |
367 | densenet1_stage3_conv65_fwd | Convolution | [1,192,14,14] | 2458.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 392149.33 | 12.50 | 67.12 | 526.42 | false | 0.124926;0.124921;0.124932;0.124929;0.124920 | 26320896;26320896;26320896;26320896;26320896 | 0;0;0;0;0 | 392928;382720;390336;393472;393184 | |
367 | densenet1_stage3_conv65_fwd | Convolution | [1,192,14,14] | 2458.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 394453.33 | 7.50 | 0.98 | 142.54 | true | 0.075374;0.075122;0.075222;0.075255;0.075182 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 394560;404544;394112;394688;394016 | |
368 | densenet1_stage3_concat32 | Concat | [1,1920,14,14] | 73.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.14 | 0 | 338218.67 | 722480.00 | 41.00 | 0.00 | 0.00 | true | 0.699722;0.121981;0.703723;0.121975;0.707957;0.122002;0.691220;0.121915;0.701644;0.121974 | 0;0;0;0;0;0;0;0;0;0 | 678144;256;673664;2048;678784;0;680704;0;675200;0 | 1409632;40672;1407104;40416;1403424;40288;1404992;40288;1405088;39264 | |
368 | densenet1_stage3_concat32 | Concat | [1,1920,14,14] | 73.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 338218.67 | 722480.00 | 41.00 | 0.00 | 0.00 | true | 0.699722;0.121981;0.703723;0.121975;0.707957;0.122002;0.691220;0.121915;0.701644;0.121974 | 0;0;0;0;0;0;0;0;0;0 | 678144;256;673664;2048;678784;0;680704;0;675200;0 | 1409632;40672;1407104;40416;1403424;40288;1404992;40288;1405088;39264 | |
369 | densenet1_stage3_batchnorm66_fwd | BatchNorm | [1,1968,14,14] | 87 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.33 | 2440320 | 31488.00 | 1609962.67 | 38.20 | 1.49 | 292.85 | true | 0.401904;0.353472;0.355362;0.408500;0.389216 | 2440320;2440320;2440320;2440320;2440320 | 31488;31488;31488;31488;31488 | 1605632;1608928;1612672;1610240;1610720 | |
370 | densenet1_stage3_relu66_fwd | Activation | [1,1968,14,14] | 82 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 771456 | 0.00 | 87989.33 | 63.30 | 8.77 | 154.29 | true | 0.626380;0.643644;0.633516;0.634812;0.631487 | 771456;771456;771456;771456;771456 | 0;0;0;0;0 | 86304;85024;90688;86976;91680 | |
371 | densenet1_stage3_conv66_fwd | Convolution | [1,1968,14,14] | 8438.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 124.00 | 169317120 | 1511424.00 | 152330.67 | 3.10 | 101.77 | 1365.46 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 169317120;169317120;169317120;169317120;169317120 | 152544;152160;152672;151648;152288 | 1511424;1511424;1511424;1511424;1511424 | |
372 | densenet1_stage3_batchnorm67_fwd | BatchNorm | [1,192,14,14] | 65.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 132725.33 | 6.80 | 1.75 | 51.01 | true | 0.068131;0.068098;0.068117;0.068086;0.068246 | 238080;238080;238080;238080;238080 | 3072;3072;3072;3072;9728 | 132128;132768;132768;133536;132640 | |
373 | densenet1_stage3_relu67_fwd | Activation | [1,192,14,14] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 75264 | 224.00 | 17749.33 | 33.80 | 4.19 | 16.13 | true | 0.337967;0.338438;0.337682;0.338112;0.341056 | 75264;75264;75264;75264;75264 | 224;224;1504;224;224 | 18432;18048;17280;17280;17920 | |
374 | densenet1_stage3_conv67_fwd | Convolution | [1,192,14,14] | 2458.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.33 | 26320896 | 0.00 | 392842.67 | 12.50 | 67.00 | 533.54 | false | 0.124917;0.124903;0.124912;0.124907;0.124916 | 26320896;26320896;26320896;26320896;26320896 | 0;1536;0;0;0 | 393056;392736;392800;392480;392992 | |
374 | densenet1_stage3_conv67_fwd | Convolution | [1,192,14,14] | 2458.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331776.00 | 394090.67 | 7.50 | 0.98 | 142.54 | true | 0.075199;0.075286;0.075260;0.075083;0.075162 | 712704;712704;712704;712704;712704 | 331776;331776;331776;331776;331776 | 393728;394400;394112;394112;394048 | |
375 | densenet1_stage3_concat33 | Concat | [1,1968,14,14] | 84.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 4021.33 | 115776.00 | 41.60 | 0.00 | 0.00 | true | 0.710851;0.121915;0.709533;0.121852;0.711723;0.121897;0.710963;0.121916;0.711508;0.121894 | 0;0;0;0;0;0;0;0;0;0 | 12352;0;14400;0;11456;0;1088;0;11584;0 | 203392;32512;205472;31936;202208;32736;191872;31808;204288;31360 | |
375 | densenet1_stage3_concat33 | Concat | [1,1968,14,14] | 84.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.29 | 0 | 4021.33 | 115776.00 | 41.60 | 0.00 | 0.00 | true | 0.710851;0.121915;0.709533;0.121852;0.711723;0.121897;0.710963;0.121916;0.711508;0.121894 | 0;0;0;0;0;0;0;0;0;0 | 12352;0;14400;0;11456;0;1088;0;11584;0 | 203392;32512;205472;31936;202208;32736;191872;31808;204288;31360 | |
376 | densenet1_stage3_batchnorm68_fwd | BatchNorm | [1,2016,14,14] | 104.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.67 | 2499840 | 32384.00 | 1531936.00 | 35.00 | 1.60 | 288.43 | true | 0.349808;0.340370;0.349705;0.351896;0.355027 | 2499840;2499840;2499840;2499840;2499840 | 32416;32352;35008;32352;32384 | 1530432;1532352;1535232;1531296;1532160 | |
377 | densenet1_stage3_relu68_fwd | Activation | [1,2016,14,14] | 84 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 790272 | 42.67 | 118421.33 | 65.30 | 6.67 | 158.05 | true | 0.637391;0.644514;0.660510;0.657847;0.658028 | 790272;790272;790272;790272;790272 | 128;0;128;0;0 | 121952;120000;116992;116672;118272 | |
378 | densenet1_stage3_conv68_fwd | Convolution | [1,2016,14,14] | 8828 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 126.67 | 173445888 | 1548288.00 | 120736.00 | 3.10 | 103.92 | 1369.31 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 173445888;173445888;173445888;173445888;173445888 | 1548288;1548288;1548288;1548288;1548288 | 121216;121248;121024;119968;118880 | |
379 | densenet1_stage3_batchnorm69_fwd | BatchNorm | [1,192,14,14] | 68.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 238080 | 3072.00 | 157034.67 | 6.80 | 1.49 | 51.01 | true | 0.068069;0.068205;0.068163;0.068028;0.068272 | 238080;238080;238080;238080;238080 | 156992;154912;156736;157376;158432 | 3072;3072;3072;3072;3072 | |
380 | densenet1_stage3_relu69_fwd | Activation | [1,192,14,14] | 16.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 23029.33 | 33.80 | 3.24 | 18.82 | true | 0.337826;0.337679;0.337821;0.337845;0.339175 | 75264;75264;75264;75264;75264 | 22784;24096;22400;22944;23360 | 224;224;224;224;5344 | |
381 | densenet1_stage3_conv69_fwd | Convolution | [1,192,14,14] | 2628.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 49.67 | 26320896 | 170.67 | 389930.67 | 12.50 | 67.47 | 529.95 | false | 0.124922;0.124929;0.124929;0.124934;0.124929 | 26320896;26320896;26320896;26320896;26320896 | 1536;256;0;0;256 | 392704;365984;393440;393024;384064 | |
381 | densenet1_stage3_conv69_fwd | Convolution | [1,192,14,14] | 2628.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 712704 | 331776.00 | 395114.67 | 7.50 | 0.98 | 152.71 | true | 0.075050;0.075241;0.075346;0.075503;0.075360 | 712704;712704;712704;712704;712704 | 331776;337664;331776;331776;331776 | 395456;407328;394656;395232;394272 | |
382 | densenet1_stage3_concat34 | Concat | [1,2016,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.43 | 0 | 458069.33 | 769066.67 | 41.70 | 0.00 | 0.00 | true | 0.706794;0.121982;0.716238;0.121909;0.721638;0.121931;0.724295;0.121958;0.715588;0.121827 | 0;0;0;0;0;0;0;0;0;0 | 1490944;44704;1493248;45216;1493504;48032;1493696;44704;1492256;44064 | 910592;0;930048;0;916736;0;922240;0;921088;0 | |
382 | densenet1_stage3_concat34 | Concat | [1,2016,14,14] | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.86 | 0 | 458069.33 | 769066.67 | 41.70 | 0.00 | 0.00 | true | 0.706794;0.121982;0.716238;0.121909;0.721638;0.121931;0.724295;0.121958;0.715588;0.121827 | 0;0;0;0;0;0;0;0;0;0 | 910592;0;930048;0;916736;0;922240;0;921088;0 | 1490944;44704;1493248;45216;1493504;48032;1493696;44704;1492256;44064 | |
383 | densenet1_stage3_batchnorm70_fwd | BatchNorm | [1,2064,14,14] | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.33 | 2559360 | 33024.00 | 1671200.00 | 39.50 | 1.50 | 307.14 | true | 0.389591;0.403260;0.396036;0.389100;0.397959 | 2559360;2559360;2559360;2559360;2559360 | 33088;33024;33024;33024;33024 | 1673280;1670368;1670080;1670336;1672896 | |
384 | densenet1_stage3_relu70_fwd | Activation | [1,2064,14,14] | 94.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 809088 | 0.00 | 99253.33 | 64.50 | 8.15 | 161.82 | true | 0.643839;0.646265;0.648109;0.645602;0.642480 | 809088;809088;809088;809088;809088 | 0;0;0;0;0 | 98176;99488;98368;100288;99904 | |
385 | densenet1_stage3_conv70_fwd | Convolution | [1,2064,14,14] | 9050.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 130.00 | 177574656 | 1585152.00 | 148661.33 | 3.10 | 102.42 | 1365.96 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 177574656;177574656;177574656;177574656;177574656 | 148096;149056;148032;148992;148896 | 1585152;1585152;1585152;1585152;1585152 | |
386 | densenet1_stage3_batchnorm71_fwd | BatchNorm | [1,192,14,14] | 61.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.00 | 238080 | 3072.00 | 136949.33 | 6.80 | 1.70 | 47.62 | true | 0.068785;0.067172;0.067501;0.067530;0.067473 | 238080;238080;238080;238080;238080 | 3072;3072;3072;4096;3072 | 138016;136224;136608;138688;136096 | |
387 | densenet1_stage3_relu71_fwd | Activation | [1,192,14,14] | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 75264 | 224.00 | 18389.33 | 34.10 | 4.04 | 18.82 | true | 0.337913;0.345841;0.337706;0.337779;0.361323 | 75264;75264;75264;75264;75264 | 224;224;224;224;288 | 17536;19200;18432;16512;19552 | |
388 | densenet1_stage3_conv71_fwd | Convolution | [1,192,14,14] | 2448.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.00 | 26320896 | 0.00 | 399008.00 | 12.50 | 65.97 | 526.42 | false | 0.124910;0.124909;0.124919;0.124908;0.124914 | 26320896;26320896;26320896;26320896;26320896 | 397440;388992;399776;399840;399808 | 0;0;0;0;512 | |
388 | densenet1_stage3_conv71_fwd | Convolution | [1,192,14,14] | 2448.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 712704 | 331840.00 | 397450.67 | 7.50 | 0.98 | 142.54 | true | 0.075292;0.075098;0.075124;0.075205;0.075173 | 712704;712704;712704;712704;712704 | 331840;331840;331840;331840;332288 | 397664;408256;397632;396032;397056 | |
389 | densenet1_stage3_concat35 | Concat | [1,2064,14,14] | 89.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 6.14 | 0 | 169546.67 | 185509.33 | 42.00 | 0.00 | 0.00 | true | 0.719265;0.121926;0.718390;0.121917;0.719537;0.121918;0.718716;0.121903;0.717171;0.121913 | 0;0;0;0;0;0;0;0;0;0 | 341216;31616;341120;31744;340768;31872;337920;31360;339136;31488 | 340608;0;337152;0;340480;0;342144;0;339648;0 | |
389 | densenet1_stage3_concat35 | Concat | [1,2064,14,14] | 89.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 5.57 | 0 | 169546.67 | 185509.33 | 42.00 | 0.00 | 0.00 | true | 0.719265;0.121926;0.718390;0.121917;0.719537;0.121918;0.718716;0.121903;0.717171;0.121913 | 0;0;0;0;0;0;0;0;0;0 | 341216;31616;341120;31744;340768;31872;337920;31360;339136;31488 | 340608;0;337152;0;340480;0;342144;0;339648;0 | |
390 | densenet1_batchnorm3_fwd | BatchNorm | [1,2112,14,14] | 115.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 8.33 | 2618880 | 34090.67 | 1624490.67 | 35.20 | 1.58 | 314.28 | true | 0.351511;0.354615;0.348305;0.348535;0.363366 | 2618880;2618880;2618880;2618880;2618880 | 1627616;1627904;1624160;1618272;1621696 | 34016;34048;34240;34144;34080 | |
391 | densenet1_relu3_fwd | Activation | [1,2112,14,14] | 88 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 827904 | 85.33 | 143402.67 | 66.90 | 5.77 | 165.58 | true | 0.672886;0.672392;0.666198;0.668537;0.643200 | 827904;827904;827904;827904;827904 | 128;128;128;0;0 | 140640;141952;143456;152928;144800 | |
392 | densenet1_conv3_fwd | Convolution | [1,2112,14,14] | 47454.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 197.00 | 999368832 | 9681461.33 | 2655008.00 | 8.30 | 81.01 | 5072.94 | false | 0.082875;0.083336;0.082746;0.082354;0.082805 | 999368832;999368832;999368832;999368832;999368832 | 2648096;2662368;2649280;2655264;2660480 | 9683680;9665248;9677024;9683680;9705536 | |
393 | densenet1_pool3_fwd | Pooling | [1,1056,14,14] | 594.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 6.00 | 1086624 | 725.33 | 15488.00 | 17.70 | 67.02 | 181.10 | false | 0.177011;0.176801;0.176873;0.176838;0.176921 | 1086624;1086624;1086624;1086624;1086624 | 128;128;1920;128;2176 | 14976;15104;16384;17792;13696 | |
394 | densenet1_stage4_batchnorm0_fwd | BatchNorm | [1,1056,7,7] | 85.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 362208 | 17056.00 | 1493.33 | 21.00 | 19.53 | 60.37 | false | 0.208390;0.208488;0.208277;0.213532;0.219724 | 362208;362208;362208;362208;362208 | 17056;17056;17056;17056;17056 | 1280;2688;640;896;2304 | |
395 | densenet1_stage4_relu0_fwd | Activation | [1,1056,7,7] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 103488 | 224.00 | 42.67 | 37.80 | 388.08 | 25.87 | false | 0.380228;0.374164;0.416789;0.378565;0.375527 | 103488;103488;103488;103488;103488 | 224;224;224;224;224 | 0;0;128;128;0 | |
396 | densenet1_stage4_conv0_fwd | Convolution | [1,1056,7,7] | 1445 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 69.00 | 25961664 | 811008.00 | 53845.33 | 3.10 | 30.02 | 376.26 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 25961664;25961664;25961664;25961664;25961664 | 59904;46976;54144;54400;52992 | 811008;811008;811008;811008;811008 | |
397 | densenet1_stage4_batchnorm1_fwd | BatchNorm | [1,192,7,7] | 17.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 1237.33 | 6.80 | 15.28 | 16.46 | true | 0.068464;0.068558;0.068311;0.068216;0.068307 | 65856;65856;65856;65856;65856 | 1152;1792;1664;768;896 | 3072;3072;3072;3072;3072 | |
398 | densenet1_stage4_relu1_fwd | Activation | [1,192,7,7] | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 0.00 | 29.90 | 84.00 | 4.70 | false | 0.291330;0.299727;0.335953;0.302064;0.294067 | 18816;18816;18816;18816;18816 | 224;224;1248;224;224 | 0;0;0;0;0 | |
399 | densenet1_stage4_conv1_fwd | Convolution | [1,192,7,7] | 761.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 28.00 | 113541120 | 7936.00 | 442784.00 | 6.20 | 251.91 | 4055.04 | false | 0.062459;0.062460;0.062459;0.062459;0.062460 | 113541120;113541120;113541120;113541120;113541120 | 8960;8448;7680;7424;7680 | 442528;442784;443168;442144;443040 | |
399 | densenet1_stage4_conv1_fwd | Convolution | [1,192,7,7] | 761.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 7.00 | 2949120 | 5973.33 | 29610.67 | 24.10 | 82.88 | 421.30 | false | 0.240616;0.247580;0.239637;0.241091;0.240058 | 2949120;2949120;2949120;2949120;2949120 | 7424;5376;6144;5632;6144 | 33056;29472;35104;25344;26304 | |
399 | densenet1_stage4_conv1_fwd | Convolution | [1,192,7,7] | 761.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 6.00 | 2399424 | 5568.00 | 399637.33 | 8.30 | 5.92 | 399.90 | true | 0.084539;0.077916;0.084202;0.080773;0.083968 | 2399424;2399424;2399424;2399424;2399424 | 15424;6080;5312;5312;5312 | 392160;401472;401216;396320;401376 | |
399 | densenet1_stage4_conv1_fwd | Convolution | [1,192,7,7] | 761.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.67 | 746496 | 334677.33 | 1064416.00 | 12.30 | 0.53 | 131.73 | true | 0.123575;0.123588;0.123362;0.123436;0.123321 | 746496;746496;746496;746496;746496 | 336384;335360;334080;334592;334080 | 1064992;1064192;1064064;1063968;1066752 | |
400 | densenet1_stage4_concat0 | Concat | [1,1056,7,7] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.57 | 0 | 6634.67 | 5541.33 | 19.30 | 0.00 | 0.00 | true | 0.265672;0.119899;0.265818;0.119880;0.265894;0.119895;0.265738;0.119901;0.265918;0.119855 | 0;0;0;0;0;0;0;0;0;0 | 6048;6528;14208;7296;416;6528;384;6688;384;7040 | 13184;0;14720;256;14464;0;13312;0;12800;256 | |
400 | densenet1_stage4_concat0 | Concat | [1,1056,7,7] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.43 | 0 | 6634.67 | 5541.33 | 19.30 | 0.00 | 0.00 | true | 0.265672;0.119899;0.265818;0.119880;0.265894;0.119895;0.265738;0.119901;0.265918;0.119855 | 0;0;0;0;0;0;0;0;0;0 | 6048;6528;14208;7296;416;6528;384;6688;384;7040 | 13184;0;14720;256;14464;0;13312;0;12800;256 | |
401 | densenet1_stage4_batchnorm2_fwd | BatchNorm | [1,1104,7,7] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 378672 | 17664.00 | 414944.00 | 20.70 | 0.88 | 63.11 | true | 0.206468;0.207624;0.205467;0.223481;0.207822 | 378672;378672;378672;378672;378672 | 17664;17664;17664;20224;17664 | 413088;394592;419808;421184;411936 | |
402 | densenet1_stage4_relu2_fwd | Activation | [1,1104,7,7] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 108192 | 0.00 | 6186.67 | 36.20 | 17.49 | 27.05 | false | 0.361657;0.362132;0.363284;0.362813;0.361175 | 108192;108192;108192;108192;108192 | 5888;6656;6272;6400;5632 | 0;0;0;0;4864 | |
403 | densenet1_stage4_conv2_fwd | Convolution | [1,1104,7,7] | 1510.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 72.00 | 27141312 | 848725.33 | 38752.00 | 3.10 | 30.58 | 376.96 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 27141312;27141312;27141312;27141312;27141312 | 849664;848896;848640;848640;848640 | 38688;38656;38784;38784;38912 | |
404 | densenet1_stage4_batchnorm3_fwd | BatchNorm | [1,192,7,7] | 18.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 27466.67 | 6.80 | 2.16 | 16.46 | true | 0.067931;0.068040;0.067857;0.068174;0.068061 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 27424;27552;27424;27552;27296 | |
405 | densenet1_stage4_relu3_fwd | Activation | [1,192,7,7] | 9 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 128.00 | 810.67 | 28.20 | 20.05 | 4.70 | false | 0.289214;0.293031;0.268041;0.261421;0.288909 | 18816;18816;18816;18816;18816 | 128;128;128;0;2048 | 896;640;768;768;1024 | |
406 | densenet1_stage4_conv3_fwd | Convolution | [1,192,7,7] | 751 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.33 | 113541120 | 0.00 | 326624.00 | 6.20 | 347.62 | 4311.74 | false | 0.062454;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;0;0;0;0 | 321216;334208;334848;323680;321984 | |
406 | densenet1_stage4_conv3_fwd | Convolution | [1,192,7,7] | 751 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 650112.00 | 22.20 | 4.54 | 491.52 | true | 0.221487;0.221829;0.221755;0.222200;0.221763 | 2949120;2949120;2949120;2949120;2949120 | 0;0;0;0;0 | 660224;661696;610752;628416;665472 | |
406 | densenet1_stage4_conv3_fwd | Convolution | [1,192,7,7] | 751 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.33 | 746496 | 331776.00 | 603818.67 | 12.30 | 0.80 | 139.98 | true | 0.123109;0.123114;0.123171;0.123005;0.123203 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 589952;564288;643680;638336;583168 | |
406 | densenet1_stage4_conv3_fwd | Convolution | [1,192,7,7] | 751 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 42.67 | 398549.33 | 8.70 | 6.02 | 479.88 | true | 0.087268;0.087237;0.087141;0.087127;0.087234 | 2399424;2399424;2399424;2399424;2399424 | 128;8192;0;0;0 | 398720;395392;398848;399232;398080 | |
407 | densenet1_stage4_concat1 | Concat | [1,1104,7,7] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 21.33 | 6805.33 | 19.70 | 0.00 | 0.00 | true | 0.274049;0.119931;0.274417;0.119966;0.274844;0.119956;0.275021;0.119929;0.274680;0.119958 | 0;0;0;0;0;0;0;0;0;0 | 1792;0;1408;0;0;0;0;0;128;0 | 10496;9856;1408;9600;640;10112;512;9600;768;9600 | |
407 | densenet1_stage4_concat1 | Concat | [1,1104,7,7] | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.57 | 0 | 21.33 | 6805.33 | 19.70 | 0.00 | 0.00 | true | 0.274049;0.119931;0.274417;0.119966;0.274844;0.119956;0.275021;0.119929;0.274680;0.119958 | 0;0;0;0;0;0;0;0;0;0 | 1792;0;1408;0;0;0;0;0;128;0 | 10496;9856;1408;9600;640;10112;512;9600;768;9600 | |
408 | densenet1_stage4_batchnorm4_fwd | BatchNorm | [1,1152,7,7] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 5.67 | 395136 | 18432.00 | 369728.00 | 19.50 | 1.02 | 69.73 | true | 0.195006;0.193300;0.195585;0.189544;0.200364 | 395136;395136;395136;395136;395136 | 18432;18432;18432;18432;18432 | 359104;367136;383104;374528;367520 | |
409 | densenet1_stage4_relu4_fwd | Activation | [1,1152,7,7] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 112896 | 0.00 | 5536.00 | 36.90 | 20.39 | 28.22 | false | 0.369513;0.368034;0.368139;0.369127;0.368813 | 112896;112896;112896;112896;112896 | 0;0;0;0;0 | 5408;5376;5408;6176;5792 | |
410 | densenet1_stage4_conv4_fwd | Convolution | [1,1152,7,7] | 1571.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 75.00 | 28320960 | 884992.00 | 28789.33 | 3.10 | 30.99 | 377.61 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 28320960;28320960;28320960;28320960;28320960 | 884736;884736;885504;884736;891392 | 28768;28992;28864;28736;28480 | |
411 | densenet1_stage4_batchnorm5_fwd | BatchNorm | [1,192,7,7] | 17.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 37024.00 | 6.80 | 1.64 | 16.46 | true | 0.068673;0.068440;0.068594;0.066141;0.068308 | 65856;65856;65856;65856;65856 | 3072;3072;3328;3072;3072 | 37024;36768;37152;36896;37280 | |
412 | densenet1_stage4_relu5_fwd | Activation | [1,192,7,7] | 10.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 18816 | 149.33 | 981.33 | 27.00 | 16.64 | 4.34 | true | 0.273264;0.269535;0.263325;0.267997;0.289663 | 18816;18816;18816;18816;18816 | 224;224;224;0;0 | 896;1024;1024;1024;896 | |
413 | densenet1_stage4_conv5_fwd | Convolution | [1,192,7,7] | 756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 359893.33 | 6.20 | 315.49 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;0;0;0;0 | 357440;360768;360960;357952;370560 | |
413 | densenet1_stage4_conv5_fwd | Convolution | [1,192,7,7] | 756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 526506.67 | 22.30 | 5.60 | 491.52 | true | 0.221921;0.221835;0.221387;0.224166;0.224410 | 2949120;2949120;2949120;2949120;2949120 | 520480;538848;534016;518464;525024 | 256;0;0;0;0 | |
413 | densenet1_stage4_conv5_fwd | Convolution | [1,192,7,7] | 756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 601333.33 | 12.30 | 0.80 | 149.30 | true | 0.123141;0.123118;0.123141;0.123395;0.123164 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 596960;600000;593920;609344;607040 | |
413 | densenet1_stage4_conv5_fwd | Convolution | [1,192,7,7] | 756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1109.33 | 398250.67 | 8.70 | 6.01 | 479.88 | true | 0.087205;0.087192;0.087163;0.087249;0.087142 | 2399424;2399424;2399424;2399424;2399424 | 0;0;256;6400;3072 | 399360;399488;399616;390272;395904 | |
414 | densenet1_stage4_concat2 | Concat | [1,1152,7,7] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.57 | 0 | 224.00 | 5194.67 | 20.20 | 0.00 | 0.00 | true | 0.283102;0.119942;0.283712;0.119942;0.283365;0.119965;0.283501;0.119978;0.283434;0.119714 | 0;0;0;0;0;0;0;0;0;0 | 1088;0;1216;0;1088;0;192;0;64;0 | 704;9472;448;9856;832;9728;704;9600;832;9728 | |
414 | densenet1_stage4_concat2 | Concat | [1,1152,7,7] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.57 | 0 | 224.00 | 5194.67 | 20.20 | 0.00 | 0.00 | true | 0.283102;0.119942;0.283712;0.119942;0.283365;0.119965;0.283501;0.119978;0.283434;0.119714 | 0;0;0;0;0;0;0;0;0;0 | 1088;0;1216;0;1088;0;192;0;64;0 | 704;9472;448;9856;832;9728;704;9600;832;9728 | |
415 | densenet1_stage4_batchnorm6_fwd | BatchNorm | [1,1200,7,7] | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 411600 | 19200.00 | 405813.33 | 19.50 | 0.97 | 68.60 | true | 0.201549;0.196830;0.187905;0.190377;0.197850 | 411600;411600;411600;411600;411600 | 19200;19200;19200;19200;19200 | 409888;405824;401728;401504;411616 | |
416 | densenet1_stage4_relu6_fwd | Activation | [1,1200,7,7] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 117600 | 0.00 | 5418.67 | 37.40 | 21.70 | 29.40 | false | 0.374491;0.374513;0.374987;0.374129;0.373833 | 117600;117600;117600;117600;117600 | 0;0;0;0;0 | 5376;5248;5760;5120;5632 | |
417 | densenet1_stage4_conv6_fwd | Convolution | [1,1200,7,7] | 1630 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 77.67 | 29500608 | 921600.00 | 29024.00 | 3.10 | 31.03 | 379.83 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 29500608;29500608;29500608;29500608;29500608 | 921600;921600;921600;921600;921600 | 29088;28928;28416;29056;29120 | |
418 | densenet1_stage4_batchnorm7_fwd | BatchNorm | [1,192,7,7] | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 36682.67 | 6.80 | 1.66 | 16.46 | true | 0.067844;0.069001;0.067975;0.067174;0.067810 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;5888 | 36768;36672;36640;36640;36736 | |
419 | densenet1_stage4_relu7_fwd | Activation | [1,192,7,7] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 992.00 | 29.30 | 15.47 | 4.70 | true | 0.294302;0.292781;0.287626;0.291909;0.294906 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 896;1024;1152;1024;928 | |
420 | densenet1_stage4_conv7_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 357088.00 | 6.20 | 317.96 | 4366.97 | false | 0.062455;0.062455;0.062454;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 359072;360416;363712;348352;351776 | 256;0;0;0;0 | |
420 | densenet1_stage4_conv7_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.67 | 2949120 | 0.00 | 560960.00 | 22.20 | 5.26 | 442.35 | true | 0.223728;0.221886;0.222028;0.221721;0.221862 | 2949120;2949120;2949120;2949120;2949120 | 0;0;0;256;0 | 557344;561856;558464;562560;571072 | |
420 | densenet1_stage4_conv7_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 596021.33 | 12.30 | 0.80 | 149.30 | true | 0.123247;0.123209;0.123083;0.123105;0.123244 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 581696;589504;592608;609664;605952 | |
420 | densenet1_stage4_conv7_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1706.67 | 396949.33 | 8.70 | 6.02 | 479.88 | true | 0.087426;0.087414;0.087502;0.087414;0.087514 | 2399424;2399424;2399424;2399424;2399424 | 390144;400384;391936;399520;399392 | 4608;0;4608;0;512 | |
421 | densenet1_stage4_concat3 | Concat | [1,1200,7,7] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 0.00 | 5098.67 | 20.60 | 0.00 | 0.00 | true | 0.292298;0.119967;0.292096;0.119951;0.292914;0.119931;0.292147;0.119904;0.292221;0.119937 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;0;2816;0;0 | 384;9856;512;9728;384;9728;160;12416;512;9728 | |
421 | densenet1_stage4_concat3 | Concat | [1,1200,7,7] | 21.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 0.00 | 5098.67 | 20.60 | 0.00 | 0.00 | true | 0.292298;0.119967;0.292096;0.119951;0.292914;0.119931;0.292147;0.119904;0.292221;0.119937 | 0;0;0;0;0;0;0;0;0;0 | 384;9856;512;9728;384;9728;160;12416;512;9728 | 0;0;0;0;0;0;0;2816;0;0 | |
422 | densenet1_stage4_batchnorm8_fwd | BatchNorm | [1,1248,7,7] | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 428064 | 19968.00 | 413216.00 | 20.10 | 0.99 | 71.34 | true | 0.197312;0.210390;0.197579;0.195922;0.207853 | 428064;428064;428064;428064;428064 | 19968;19968;19968;19968;19968 | 411200;414624;416576;412192;412832 | |
423 | densenet1_stage4_relu8_fwd | Activation | [1,1248,7,7] | 18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 122304 | 0.00 | 6432.00 | 38.10 | 19.01 | 30.58 | false | 0.380501;0.381883;0.380179;0.382811;0.379877 | 122304;122304;122304;122304;122304 | 0;0;0;0;0 | 6400;6528;6432;6464;6400 | |
424 | densenet1_stage4_conv8_fwd | Convolution | [1,1248,7,7] | 1705 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 80.67 | 30680256 | 958549.33 | 28181.33 | 3.10 | 31.09 | 380.33 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 30680256;30680256;30680256;30680256;30680256 | 28096;27840;28224;28224;28224 | 958720;963584;958464;958464;958464 | |
425 | densenet1_stage4_batchnorm9_fwd | BatchNorm | [1,192,7,7] | 51.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 36245.33 | 6.80 | 1.67 | 16.46 | true | 0.068372;0.068352;0.068580;0.067797;0.068484 | 65856;65856;65856;65856;65856 | 3072;5120;3072;3072;3072 | 36384;35744;36128;36224;36608 | |
426 | densenet1_stage4_relu9_fwd | Activation | [1,192,7,7] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 1418.67 | 28.30 | 11.45 | 4.70 | true | 0.262275;0.269008;0.288799;0.292904;0.290272 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 1408;2048;1408;1440;1184 | |
427 | densenet1_stage4_conv9_fwd | Convolution | [1,192,7,7] | 757.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 365280.00 | 6.20 | 310.83 | 4366.97 | false | 0.062455;0.062454;0.062454;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 376288;361024;352448;374464;360352 | 0;0;256;0;0 | |
427 | densenet1_stage4_conv9_fwd | Convolution | [1,192,7,7] | 757.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 558357.33 | 22.20 | 5.28 | 491.52 | true | 0.221724;0.221858;0.221673;0.221529;0.221711 | 2949120;2949120;2949120;2949120;2949120 | 0;0;0;0;0 | 567744;541152;580128;566176;503744 | |
427 | densenet1_stage4_conv9_fwd | Convolution | [1,192,7,7] | 757.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 333568.00 | 633152.00 | 12.30 | 0.77 | 149.30 | true | 0.123174;0.123188;0.123163;0.123115;0.123000 | 746496;746496;746496;746496;746496 | 331776;331776;337152;331776;338432 | 619392;659648;618656;620416;696096 | |
427 | densenet1_stage4_conv9_fwd | Convolution | [1,192,7,7] | 757.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1941.33 | 399456.00 | 8.70 | 5.98 | 479.88 | true | 0.087163;0.087401;0.087207;0.087188;0.087188 | 2399424;2399424;2399424;2399424;2399424 | 64;64;5696;64;5952 | 400096;399840;398688;399840;387680 | |
428 | densenet1_stage4_concat4 | Concat | [1,1248,7,7] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 661.33 | 6746.67 | 21.00 | 0.00 | 0.00 | true | 0.300562;0.119908;0.301723;0.119904;0.300209;0.119913;0.299895;0.119957;0.299855;0.119865 | 0;0;0;0;0;0;0;0;0;0 | 1792;0;2304;0;3968;0;1536;0;640;0 | 512;9472;3456;9856;5504;9472;512;9984;2848;9728 | |
428 | densenet1_stage4_concat4 | Concat | [1,1248,7,7] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.57 | 0 | 661.33 | 6746.67 | 21.00 | 0.00 | 0.00 | true | 0.300562;0.119908;0.301723;0.119904;0.300209;0.119913;0.299895;0.119957;0.299855;0.119865 | 0;0;0;0;0;0;0;0;0;0 | 1792;0;2304;0;3968;0;1536;0;640;0 | 512;9472;3456;9856;5504;9472;512;9984;2848;9728 | |
429 | densenet1_stage4_batchnorm10_fwd | BatchNorm | [1,1296,7,7] | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 444528 | 20736.00 | 447136.00 | 20.20 | 0.95 | 74.09 | true | 0.201731;0.201787;0.203164;0.199851;0.215089 | 444528;444528;444528;444528;444528 | 20736;20736;20736;20736;20736 | 449568;445408;439456;449984;446432 | |
430 | densenet1_stage4_relu10_fwd | Activation | [1,1296,7,7] | 19.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 127008 | 0.00 | 5717.33 | 38.60 | 22.21 | 31.75 | false | 0.386059;0.385674;0.387120;0.386250;0.385640 | 127008;127008;127008;127008;127008 | 0;0;0;0;0 | 5632;5888;5632;6528;5248 | |
431 | densenet1_stage4_conv10_fwd | Convolution | [1,1296,7,7] | 1759 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 31859904 | 995328.00 | 29056.00 | 3.10 | 31.10 | 379.28 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 31859904;31859904;31859904;31859904;31859904 | 995328;995328;995328;995328;995328 | 29184;29056;29056;29056;28544 | |
432 | densenet1_stage4_batchnorm11_fwd | BatchNorm | [1,192,7,7] | 47 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 65856 | 3072.00 | 36810.67 | 6.80 | 1.65 | 15.20 | true | 0.068577;0.068550;0.068604;0.068318;0.068344 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 36768;36768;36640;36896;37024 | |
433 | densenet1_stage4_relu11_fwd | Activation | [1,192,7,7] | 11 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 149.33 | 981.33 | 28.40 | 16.64 | 4.70 | true | 0.289715;0.299175;0.268458;0.267284;0.294666 | 18816;18816;18816;18816;18816 | 224;224;224;0;0 | 1024;896;1024;896;1024 | |
434 | densenet1_stage4_conv11_fwd | Convolution | [1,192,7,7] | 753.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 3328.00 | 369525.33 | 6.20 | 304.52 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;6656;0;5120;4864 | 361504;372832;367104;368640;376992 | |
434 | densenet1_stage4_conv11_fwd | Convolution | [1,192,7,7] | 753.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.33 | 2949120 | 0.00 | 599744.00 | 22.20 | 4.92 | 465.68 | true | 0.221744;0.221599;0.221809;0.223648;0.221884 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;0 | 600960;596608;559776;607648;601664 | |
434 | densenet1_stage4_conv11_fwd | Convolution | [1,192,7,7] | 753.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 632576.00 | 12.30 | 0.77 | 149.30 | true | 0.123172;0.123169;0.123158;0.123034;0.123167 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 631072;631040;665824;635616;626688 | |
434 | densenet1_stage4_conv11_fwd | Convolution | [1,192,7,7] | 753.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1578.67 | 390400.00 | 8.70 | 6.12 | 479.88 | true | 0.087309;0.087475;0.087306;0.087268;0.087403 | 2399424;2399424;2399424;2399424;2399424 | 128;20352;0;0;4608 | 399488;375552;399872;388864;382848 | |
435 | densenet1_stage4_concat5 | Concat | [1,1296,7,7] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.57 | 0 | 0.00 | 8192.00 | 21.30 | 0.00 | 0.00 | true | 0.305805;0.119972;0.307346;0.119933;0.307089;0.119923;0.306525;0.119912;0.306335;0.119965 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;2816;0;0;0 | 6784;9600;4096;9856;3584;9856;8064;9600;5504;9600 | |
435 | densenet1_stage4_concat5 | Concat | [1,1296,7,7] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.57 | 0 | 0.00 | 8192.00 | 21.30 | 0.00 | 0.00 | true | 0.305805;0.119972;0.307346;0.119933;0.307089;0.119923;0.306525;0.119912;0.306335;0.119965 | 0;0;0;0;0;0;0;0;0;0 | 0;0;0;0;0;0;2816;0;0;0 | 6784;9600;4096;9856;3584;9856;8064;9600;5504;9600 | |
436 | densenet1_stage4_batchnorm12_fwd | BatchNorm | [1,1344,7,7] | 29 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 460992 | 21504.00 | 443413.33 | 20.40 | 0.99 | 76.83 | true | 0.196955;0.205724;0.202553;0.205162;0.202998 | 460992;460992;460992;460992;460992 | 21504;21504;21504;21504;21504 | 442304;447104;454592;440832;437728 | |
437 | densenet1_stage4_relu12_fwd | Activation | [1,1344,7,7] | 19.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 131712 | 0.00 | 6250.67 | 39.30 | 21.07 | 32.93 | false | 0.392906;0.393199;0.392639;0.392346;0.393450 | 131712;131712;131712;131712;131712 | 0;0;0;0;0 | 6400;6432;5760;5920;6656 | |
438 | densenet1_stage4_conv12_fwd | Convolution | [1,1344,7,7] | 1820 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.00 | 33039552 | 1032192.00 | 28096.00 | 3.10 | 31.16 | 379.76 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 33039552;33039552;33039552;33039552;33039552 | 28096;28096;28096;27968;28224 | 1032192;1032448;1032192;1032192;1032192 | |
439 | densenet1_stage4_batchnorm13_fwd | BatchNorm | [1,192,7,7] | 51 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 36682.67 | 6.80 | 1.66 | 16.46 | true | 0.068026;0.067499;0.068057;0.067780;0.068435 | 65856;65856;65856;65856;65856 | 36640;36640;36768;36640;36768 | 3072;3072;3072;3072;3072 | |
440 | densenet1_stage4_relu13_fwd | Activation | [1,192,7,7] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 1024.00 | 29.10 | 15.08 | 4.70 | true | 0.286598;0.291583;0.289842;0.291488;0.379139 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 1024;1024;1024;1152;896 | |
441 | densenet1_stage4_conv13_fwd | Convolution | [1,192,7,7] | 755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 597.33 | 378677.33 | 6.20 | 299.36 | 4366.97 | false | 0.062455;0.062454;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;5120;0;0;1792 | 378464;400960;375904;374080;381664 | |
441 | densenet1_stage4_conv13_fwd | Convolution | [1,192,7,7] | 755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 573461.33 | 22.20 | 5.14 | 491.52 | true | 0.222143;0.221750;0.221715;0.224349;0.221964 | 2949120;2949120;2949120;2949120;2949120 | 0;0;0;0;0 | 595232;544928;587552;552416;580416 | |
441 | densenet1_stage4_conv13_fwd | Convolution | [1,192,7,7] | 755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 660213.33 | 12.30 | 0.75 | 149.30 | true | 0.123200;0.123268;0.123221;0.123253;0.123267 | 746496;746496;746496;746496;746496 | 645696;681792;633056;689760;653152 | 332032;331776;331776;331776;331776 | |
441 | densenet1_stage4_conv13_fwd | Convolution | [1,192,7,7] | 755 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1088.00 | 399200.00 | 8.70 | 5.99 | 479.88 | true | 0.087467;0.087492;0.087504;0.087215;0.087300 | 2399424;2399424;2399424;2399424;2399424 | 64;5184;64;64;3136 | 400352;379488;399968;400512;397280 | |
442 | densenet1_stage4_concat6 | Concat | [1,1344,7,7] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 960.00 | 5616.00 | 21.60 | 0.00 | 0.00 | true | 0.311507;0.119916;0.312129;0.119906;0.311283;0.119955;0.311629;0.119909;0.311841;0.119863 | 0;0;0;0;0;0;0;0;0;0 | 1792;0;4224;0;1792;6656;1536;0;640;0 | 1280;9856;1056;9856;1408;19584;1568;9728;1280;15360 | |
442 | densenet1_stage4_concat6 | Concat | [1,1344,7,7] | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 960.00 | 5616.00 | 21.60 | 0.00 | 0.00 | true | 0.311507;0.119916;0.312129;0.119906;0.311283;0.119955;0.311629;0.119909;0.311841;0.119863 | 0;0;0;0;0;0;0;0;0;0 | 1792;0;4224;0;1792;6656;1536;0;640;0 | 1280;9856;1056;9856;1408;19584;1568;9728;1280;15360 | |
443 | densenet1_stage4_batchnorm14_fwd | BatchNorm | [1,1392,7,7] | 28.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.00 | 477456 | 22357.33 | 473322.67 | 20.70 | 0.96 | 79.58 | true | 0.201368;0.198000;0.210635;0.216693;0.208162 | 477456;477456;477456;477456;477456 | 22528;22272;25600;22272;22272 | 477824;477088;452864;476288;466592 | |
444 | densenet1_stage4_relu14_fwd | Activation | [1,1392,7,7] | 19.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 136416 | 0.00 | 5376.00 | 39.90 | 25.38 | 34.10 | false | 0.397968;0.398708;0.400985;0.400266;0.399073 | 136416;136416;136416;136416;136416 | 0;0;0;0;256 | 5888;5248;6144;4992;4992 | |
445 | densenet1_stage4_conv14_fwd | Convolution | [1,1392,7,7] | 1894.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 90.33 | 34219200 | 1069056.00 | 29269.33 | 3.10 | 31.16 | 378.81 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 34219200;34219200;34219200;34219200;34219200 | 1069312;1069056;1069056;1069056;1069056 | 29312;29056;29184;29312;29568 | |
446 | densenet1_stage4_batchnorm15_fwd | BatchNorm | [1,192,7,7] | 51.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 36618.67 | 6.80 | 1.66 | 16.46 | true | 0.067958;0.068075;0.068248;0.068055;0.067516 | 65856;65856;65856;65856;65856 | 36704;36768;36384;36352;36768 | 3072;3072;3072;3072;3072 | |
447 | densenet1_stage4_relu15_fwd | Activation | [1,192,7,7] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 149.33 | 1120.00 | 29.30 | 14.82 | 4.70 | true | 0.371636;0.291552;0.291023;0.293105;0.294628 | 18816;18816;18816;18816;18816 | 896;1152;1536;1312;896 | 224;224;224;0;0 | |
448 | densenet1_stage4_conv15_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 370336.00 | 6.20 | 306.59 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;0;0;0;0 | 373440;372480;378208;365088;361568 | |
448 | densenet1_stage4_conv15_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.67 | 2949120 | 0.00 | 615626.67 | 22.20 | 4.79 | 442.35 | true | 0.221709;0.221467;0.221848;0.221791;0.222013 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;0 | 614496;619392;612992;584128;626624 | |
448 | densenet1_stage4_conv15_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 650986.67 | 12.30 | 0.76 | 149.30 | true | 0.123214;0.123127;0.123163;0.123185;0.123183 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 648896;647680;648096;688352;655968 | |
448 | densenet1_stage4_conv15_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 0.00 | 399989.33 | 8.70 | 6.00 | 479.88 | true | 0.087478;0.087525;0.087096;0.087369;0.087427 | 2399424;2399424;2399424;2399424;2399424 | 0;0;0;0;0 | 399776;399776;400288;400544;399904 | |
449 | densenet1_stage4_concat7 | Concat | [1,1392,7,7] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 1429.33 | 13674.67 | 21.60 | 0.00 | 0.00 | true | 0.312497;0.119914;0.312473;0.119941;0.312367;0.119856;0.312647;0.119858;0.315408;0.119898 | 0;0;0;0;0;0;0;0;0;0 | 3456;0;3456;0;2688;0;3328;0;2560;0 | 17792;9888;16512;9568;18176;9856;19328;9600;18432;9824 | |
449 | densenet1_stage4_concat7 | Concat | [1,1392,7,7] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 1429.33 | 13674.67 | 21.60 | 0.00 | 0.00 | true | 0.312497;0.119914;0.312473;0.119941;0.312367;0.119856;0.312647;0.119858;0.315408;0.119898 | 0;0;0;0;0;0;0;0;0;0 | 3456;0;3456;0;2688;0;3328;0;2560;0 | 17792;9888;16512;9568;18176;9856;19328;9600;18432;9824 | |
450 | densenet1_stage4_batchnorm16_fwd | BatchNorm | [1,1440,7,7] | 29.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 493920 | 23040.00 | 458997.33 | 20.60 | 1.02 | 70.56 | true | 0.205505;0.210325;0.200419;0.209214;0.202543 | 493920;493920;493920;493920;493920 | 23040;23040;23040;23040;23040 | 459104;459392;469792;458496;454752 | |
451 | densenet1_stage4_relu16_fwd | Activation | [1,1440,7,7] | 20.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 141120 | 3413.33 | 6624.00 | 40.60 | 14.06 | 35.28 | true | 0.405228;0.406175;0.406644;0.405167;0.406738 | 141120;141120;141120;141120;141120 | 0;5632;0;5120;5120 | 7552;6304;6528;6560;6784 | |
452 | densenet1_stage4_conv16_fwd | Convolution | [1,1440,7,7] | 1955 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 93.00 | 35398848 | 1105920.00 | 28096.00 | 3.10 | 31.22 | 380.63 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 35398848;35398848;35398848;35398848;35398848 | 1105920;1105920;1105920;1105920;1105920 | 27840;28096;28480;27840;28352 | |
453 | densenet1_stage4_batchnorm17_fwd | BatchNorm | [1,192,7,7] | 49.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 33813.33 | 6.80 | 1.79 | 16.46 | true | 0.068577;0.068047;0.068187;0.068350;0.068323 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 33632;33696;33824;33920;34592 | |
454 | densenet1_stage4_relu17_fwd | Activation | [1,192,7,7] | 13.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 3808.00 | 29.10 | 4.67 | 4.70 | true | 0.291074;0.290408;0.291521;0.289789;0.294164 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 3968;4096;3712;3744;3200 | |
455 | densenet1_stage4_conv17_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 382186.67 | 6.20 | 297.08 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 256;0;0;0;0 | 383168;388160;380352;372320;383040 | |
455 | densenet1_stage4_conv17_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 591104.00 | 22.20 | 4.99 | 491.52 | true | 0.220290;0.221442;0.223898;0.221750;0.222013 | 2949120;2949120;2949120;2949120;2949120 | 0;0;0;0;0 | 608064;600064;579296;529472;593952 | |
455 | densenet1_stage4_conv17_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 693056.00 | 12.30 | 0.73 | 149.30 | true | 0.123285;0.123235;0.123198;0.123108;0.123124 | 746496;746496;746496;746496;746496 | 680736;687552;699744;738656;691872 | 331776;331776;331776;331776;331776 | |
455 | densenet1_stage4_conv17_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 661.33 | 399978.67 | 8.80 | 5.99 | 479.88 | true | 0.087608;0.087506;0.087608;0.087469;0.087236 | 2399424;2399424;2399424;2399424;2399424 | 64;64;1856;5696;64 | 400128;399712;400480;387680;400096 | |
456 | densenet1_stage4_concat8 | Concat | [1,1440,7,7] | 23.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 1024.00 | 6485.33 | 21.90 | 0.00 | 0.00 | true | 0.320044;0.119920;0.319734;0.119920;0.318278;0.119952;0.318563;0.119959;0.319599;0.119896 | 0;0;0;0;0;0;0;0;0;0 | 3584;256;3328;0;2432;0;2176;0;1280;0 | 16512;5760;896;9856;1280;9728;2816;9888;896;9472 | |
456 | densenet1_stage4_concat8 | Concat | [1,1440,7,7] | 23.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 1024.00 | 6485.33 | 21.90 | 0.00 | 0.00 | true | 0.320044;0.119920;0.319734;0.119920;0.318278;0.119952;0.318563;0.119959;0.319599;0.119896 | 0;0;0;0;0;0;0;0;0;0 | 3584;256;3328;0;2432;0;2176;0;1280;0 | 16512;5760;896;9856;1280;9728;2816;9888;896;9472 | |
457 | densenet1_stage4_batchnorm18_fwd | BatchNorm | [1,1488,7,7] | 30 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.67 | 510384 | 23808.00 | 511808.00 | 21.10 | 0.95 | 76.55 | true | 0.213245;0.206973;0.217016;0.207952;0.210563 | 510384;510384;510384;510384;510384 | 23808;23808;23808;23808;23808 | 474016;518816;505024;512960;517440 | |
458 | densenet1_stage4_relu18_fwd | Activation | [1,1488,7,7] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 145824 | 0.00 | 5546.67 | 41.20 | 26.29 | 36.46 | false | 0.411709;0.410500;0.411714;0.412532;0.411174 | 145824;145824;145824;145824;145824 | 0;0;0;0;0 | 5504;5632;5760;4736;5504 | |
459 | densenet1_stage4_conv18_fwd | Convolution | [1,1488,7,7] | 2024.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 96.33 | 36578496 | 1142784.00 | 29098.67 | 3.10 | 31.21 | 379.71 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 36578496;36578496;36578496;36578496;36578496 | 1143040;1142784;1142784;1142784;1142784 | 29056;29056;29056;29184;29216 | |
460 | densenet1_stage4_batchnorm19_fwd | BatchNorm | [1,192,7,7] | 49.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 36768.00 | 6.80 | 1.65 | 16.46 | true | 0.068371;0.068176;0.068536;0.068508;0.068543 | 65856;65856;65856;65856;65856 | 36768;36512;36896;37024;36640 | 3072;3072;4864;3072;3072 | |
461 | densenet1_stage4_relu19_fwd | Activation | [1,192,7,7] | 16.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 149.33 | 1194.67 | 29.40 | 14.00 | 4.70 | true | 0.292765;0.283916;0.295650;0.296885;0.294471 | 18816;18816;18816;18816;18816 | 1152;1152;1920;896;1280 | 224;224;5856;0;0 | |
462 | densenet1_stage4_conv19_fwd | Convolution | [1,192,7,7] | 754.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 378517.33 | 6.20 | 299.96 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;0;0;0;0 | 377120;378784;378080;378688;379936 | |
462 | densenet1_stage4_conv19_fwd | Convolution | [1,192,7,7] | 754.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.33 | 2949120 | 85.33 | 636064.00 | 22.20 | 4.64 | 465.68 | true | 0.221815;0.221466;0.221457;0.221853;0.221849 | 2949120;2949120;2949120;2949120;2949120 | 600672;637184;634752;638976;636256 | 256;0;0;0;5376 | |
462 | densenet1_stage4_conv19_fwd | Convolution | [1,192,7,7] | 754.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 681824.00 | 12.30 | 0.74 | 149.30 | true | 0.123094;0.123177;0.123134;0.123155;0.122952 | 746496;746496;746496;746496;746496 | 331776;334080;331776;331776;331776 | 719712;675392;679968;680800;684704 | |
462 | densenet1_stage4_conv19_fwd | Convolution | [1,192,7,7] | 754.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 0.00 | 399872.00 | 8.70 | 6.00 | 479.88 | true | 0.087590;0.087688;0.087389;0.087416;0.087463 | 2399424;2399424;2399424;2399424;2399424 | 0;3072;0;0;0 | 399744;396032;400000;400384;399872 | |
463 | densenet1_stage4_concat9 | Concat | [1,1488,7,7] | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 64.00 | 13845.33 | 22.30 | 0.00 | 0.00 | true | 0.325241;0.119976;0.325335;0.119963;0.325936;0.119913;0.326528;0.120011;0.325203;0.119938 | 0;0;0;0;0;0;0;0;0;0 | 7040;19328;6560;19328;14080;15616;7680;19328;7040;19328 | 128;0;128;0;128;0;1152;0;1152;0 | |
463 | densenet1_stage4_concat9 | Concat | [1,1488,7,7] | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 64.00 | 13845.33 | 22.30 | 0.00 | 0.00 | true | 0.325241;0.119976;0.325335;0.119963;0.325936;0.119913;0.326528;0.120011;0.325203;0.119938 | 0;0;0;0;0;0;0;0;0;0 | 128;0;128;0;128;0;1152;0;1152;0 | 7040;19328;6560;19328;14080;15616;7680;19328;7040;19328 | |
464 | densenet1_stage4_batchnorm20_fwd | BatchNorm | [1,1536,7,7] | 29.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.67 | 526848 | 24576.00 | 554304.00 | 21.30 | 0.91 | 79.02 | true | 0.213297;0.203981;0.215605;0.212487;0.212952 | 526848;526848;526848;526848;526848 | 554240;554624;551712;555168;554048 | 24576;24576;24576;24576;24576 | |
465 | densenet1_stage4_relu20_fwd | Activation | [1,1536,7,7] | 21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 150528 | 0.00 | 6826.67 | 41.90 | 22.05 | 37.63 | false | 0.419738;0.418347;0.439423;0.418519;0.418128 | 150528;150528;150528;150528;150528 | 0;0;0;0;0 | 7808;6432;6816;6880;6784 | |
466 | densenet1_stage4_conv20_fwd | Convolution | [1,1536,7,7] | 2098 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 98.67 | 37758144 | 1179648.00 | 29354.67 | 3.10 | 31.23 | 382.68 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37758144;37758144;37758144;37758144;37758144 | 1179648;1179648;1179648;1179648;1179648 | 29184;29504;29248;29312;29696 | |
467 | densenet1_stage4_batchnorm21_fwd | BatchNorm | [1,192,7,7] | 53.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.33 | 65856 | 3072.00 | 36458.67 | 6.90 | 1.67 | 15.20 | true | 0.068660;0.068587;0.068550;0.068698;0.068609 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 36480;36256;36576;36512;36384 | |
468 | densenet1_stage4_relu21_fwd | Activation | [1,192,7,7] | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 1237.33 | 29.00 | 12.88 | 4.70 | true | 0.287521;0.290160;0.290134;0.291092;0.290041 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 1024;2176;1152;1280;1280 | |
469 | densenet1_stage4_conv21_fwd | Convolution | [1,192,7,7] | 756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 374080.00 | 6.20 | 303.52 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;0;0;2048;0 | 377440;376896;383168;363616;367904 | |
469 | densenet1_stage4_conv21_fwd | Convolution | [1,192,7,7] | 756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 616064.00 | 22.20 | 4.79 | 491.52 | true | 0.221978;0.221714;0.221861;0.221877;0.221955 | 2949120;2949120;2949120;2949120;2949120 | 0;0;0;0;0 | 608448;629408;592224;617824;621920 | |
469 | densenet1_stage4_conv21_fwd | Convolution | [1,192,7,7] | 756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 727744.00 | 12.30 | 0.70 | 149.30 | true | 0.123167;0.123277;0.123225;0.123084;0.123236 | 746496;746496;746496;746496;746496 | 717504;718048;755360;736704;728480 | 331776;331776;331776;336896;331776 | |
469 | densenet1_stage4_conv21_fwd | Convolution | [1,192,7,7] | 756.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 64.00 | 397834.67 | 8.70 | 6.03 | 479.88 | true | 0.087106;0.087116;0.087281;0.087250;0.087254 | 2399424;2399424;2399424;2399424;2399424 | 64;64;64;4672;64 | 388192;399456;400096;395104;398944 | |
470 | densenet1_stage4_concat10 | Concat | [1,1536,7,7] | 24.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 2218.67 | 8864.00 | 22.50 | 0.00 | 0.00 | true | 0.330400;0.119897;0.330982;0.119911;0.331106;0.119959;0.330015;0.119959;0.329986;0.119867 | 0;0;0;0;0;0;0;0;0;0 | 8768;9728;5568;9856;6016;9728;5056;9856;9472;9472 | 11264;0;4224;0;4864;0;4608;0;2432;2048 | |
470 | densenet1_stage4_concat10 | Concat | [1,1536,7,7] | 24.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 2218.67 | 8864.00 | 22.50 | 0.00 | 0.00 | true | 0.330400;0.119897;0.330982;0.119911;0.331106;0.119959;0.330015;0.119959;0.329986;0.119867 | 0;0;0;0;0;0;0;0;0;0 | 8768;9728;5568;9856;6016;9728;5056;9856;9472;9472 | 11264;0;4224;0;4864;0;4608;0;2432;2048 | |
471 | densenet1_stage4_batchnorm22_fwd | BatchNorm | [1,1584,7,7] | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 543312 | 25344.00 | 525322.67 | 21.20 | 0.99 | 77.62 | true | 0.212942;0.208329;0.218206;0.214051;0.206462 | 543312;543312;543312;543312;543312 | 25344;25344;25344;32256;25344 | 521408;527808;531936;526752;511296 | |
472 | densenet1_stage4_relu22_fwd | Activation | [1,1584,7,7] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 155232 | 0.00 | 31648.00 | 42.40 | 4.90 | 35.83 | true | 0.424937;0.423224;0.424268;0.423985;0.423692 | 155232;155232;155232;155232;155232 | 0;0;0;0;0 | 33184;32416;32032;30496;30496 | |
473 | densenet1_stage4_conv22_fwd | Convolution | [1,1584,7,7] | 2154 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 101.00 | 38937792 | 1216512.00 | 26720.00 | 3.10 | 31.32 | 385.52 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 38937792;38937792;38937792;38937792;38937792 | 1216768;1216512;1216512;1216512;1216512 | 27264;25728;27776;26784;26112 | |
474 | densenet1_stage4_batchnorm23_fwd | BatchNorm | [1,192,7,7] | 51.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 35722.67 | 6.90 | 1.70 | 16.46 | true | 0.068428;0.064673;0.069114;0.068648;0.068691 | 65856;65856;65856;65856;65856 | 8448;3072;3072;3072;3072 | 35168;36512;36384;35616;34688 | |
475 | densenet1_stage4_relu23_fwd | Activation | [1,192,7,7] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 149.33 | 4992.00 | 29.50 | 3.66 | 4.70 | true | 0.299270;0.290500;0.286178;0.385162;0.294685 | 18816;18816;18816;18816;18816 | 224;224;224;0;0 | 5120;4864;4352;4992;6560 | |
476 | densenet1_stage4_conv23_fwd | Convolution | [1,192,7,7] | 755.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 385493.33 | 6.20 | 294.53 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 512;0;0;0;0 | 390048;387744;401600;378688;373600 | |
476 | densenet1_stage4_conv23_fwd | Convolution | [1,192,7,7] | 755.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 650624.00 | 22.20 | 4.53 | 491.52 | true | 0.221886;0.221853;0.222056;0.221991;0.221636 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;0 | 612640;624608;672256;662880;664384 | |
476 | densenet1_stage4_conv23_fwd | Convolution | [1,192,7,7] | 755.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 739872.00 | 12.30 | 0.70 | 149.30 | true | 0.123141;0.123160;0.123172;0.123208;0.123313 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 771360;759808;723616;728384;731424 | |
476 | densenet1_stage4_conv23_fwd | Convolution | [1,192,7,7] | 755.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 4352.00 | 394016.00 | 8.70 | 6.02 | 479.88 | true | 0.087173;0.087172;0.087161;0.087143;0.087256 | 2399424;2399424;2399424;2399424;2399424 | 399520;398880;384544;398368;384800 | 512;0;12544;0;12800 | |
477 | densenet1_stage4_concat11 | Concat | [1,1584,7,7] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 5205.33 | 14032.00 | 21.00 | 0.00 | 0.00 | true | 0.298150;0.119950;0.298846;0.119991;0.303666;0.119922;0.301170;0.119918;0.302395;0.119939 | 0;0;0;0;0;0;0;0;0;0 | 11776;0;11520;0;16640;0;9728;0;9984;0 | 4352;19456;5024;22400;9120;19328;4416;19328;15648;15744 | |
477 | densenet1_stage4_concat11 | Concat | [1,1584,7,7] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 5205.33 | 14032.00 | 21.00 | 0.00 | 0.00 | true | 0.298150;0.119950;0.298846;0.119991;0.303666;0.119922;0.301170;0.119918;0.302395;0.119939 | 0;0;0;0;0;0;0;0;0;0 | 4352;19456;5024;22400;9120;19328;4416;19328;15648;15744 | 11776;0;11520;0;16640;0;9728;0;9984;0 | |
478 | densenet1_stage4_batchnorm24_fwd | BatchNorm | [1,1632,7,7] | 31.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 6.33 | 559776 | 26112.00 | 595498.67 | 20.70 | 0.90 | 88.39 | true | 0.224840;0.206419;0.211597;0.201467;0.202959 | 559776;559776;559776;559776;559776 | 26112;26112;30976;26112;26112 | 599232;592000;595264;599392;588448 | |
479 | densenet1_stage4_relu24_fwd | Activation | [1,1632,7,7] | 22 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 159936 | 0.00 | 6965.33 | 43.00 | 22.96 | 39.98 | false | 0.431335;0.431774;0.428447;0.429786;0.429714 | 159936;159936;159936;159936;159936 | 6528;7328;7552;6528;7040 | 0;0;0;0;0 | |
480 | densenet1_stage4_conv24_fwd | Convolution | [1,1632,7,7] | 2303.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 104.67 | 40117440 | 1253376.00 | 29120.00 | 3.10 | 31.28 | 383.29 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 40117440;40117440;40117440;40117440;40117440 | 1253632;1253376;1253376;1253376;1253376 | 29120;29120;29120;29248;29120 | |
481 | densenet1_stage4_batchnorm25_fwd | BatchNorm | [1,192,7,7] | 61.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 36298.67 | 6.90 | 1.67 | 16.46 | true | 0.068549;0.068559;0.068643;0.068572;0.068390 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 36256;36064;36288;36352;36640 | |
482 | densenet1_stage4_relu25_fwd | Activation | [1,192,7,7] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 18816 | 224.00 | 1290.67 | 29.20 | 12.42 | 4.34 | true | 0.288401;0.291061;0.292502;0.292634;0.291691 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 1408;1664;1152;1312;1152 | |
483 | densenet1_stage4_conv25_fwd | Convolution | [1,192,7,7] | 788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 384096.00 | 6.20 | 295.61 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;0;0;0;0 | 377888;389056;385344;375616;390720 | |
483 | densenet1_stage4_conv25_fwd | Convolution | [1,192,7,7] | 788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 655562.67 | 22.20 | 4.50 | 491.52 | true | 0.221912;0.221896;0.221990;0.221746;0.222030 | 2949120;2949120;2949120;2949120;2949120 | 0;0;0;0;0 | 632256;665984;665344;647296;654048 | |
483 | densenet1_stage4_conv25_fwd | Convolution | [1,192,7,7] | 788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 769216.00 | 12.30 | 0.68 | 149.30 | true | 0.123294;0.123177;0.123070;0.123313;0.123264 | 746496;746496;746496;746496;746496 | 331776;331776;331776;335872;331776 | 797024;745888;767936;783232;756480 | |
483 | densenet1_stage4_conv25_fwd | Convolution | [1,192,7,7] | 788.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 64.00 | 399370.67 | 8.70 | 6.01 | 479.88 | true | 0.087531;0.087443;0.087501;0.087568;0.087445 | 2399424;2399424;2399424;2399424;2399424 | 64;64;3136;64;64 | 399712;399584;396768;399328;399200 | |
484 | densenet1_stage4_concat12 | Concat | [1,1632,7,7] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.57 | 0 | 2218.67 | 7333.33 | 22.90 | 0.00 | 0.00 | true | 0.338279;0.119944;0.338074;0.119927;0.337410;0.119942;0.338082;0.119966;0.337239;0.119921 | 0;0;0;0;0;0;0;0;0;0 | 5632;0;12288;256;5376;0;4352;0;3328;0 | 4448;9856;15808;6016;4224;9728;3712;10240;3584;9728 | |
484 | densenet1_stage4_concat12 | Concat | [1,1632,7,7] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.43 | 0 | 2218.67 | 7333.33 | 22.90 | 0.00 | 0.00 | true | 0.338279;0.119944;0.338074;0.119927;0.337410;0.119942;0.338082;0.119966;0.337239;0.119921 | 0;0;0;0;0;0;0;0;0;0 | 5632;0;12288;256;5376;0;4352;0;3328;0 | 4448;9856;15808;6016;4224;9728;3712;10240;3584;9728 | |
485 | densenet1_stage4_batchnorm26_fwd | BatchNorm | [1,1680,7,7] | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 576240 | 26880.00 | 556821.33 | 20.60 | 0.99 | 82.32 | true | 0.211990;0.207562;0.202501;0.200895;0.208204 | 576240;576240;576240;576240;576240 | 26880;26880;26880;26880;26880 | 559744;539552;557152;564768;553568 | |
486 | densenet1_stage4_relu26_fwd | Activation | [1,1680,7,7] | 23.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 164640 | 0.00 | 31669.33 | 43.60 | 5.20 | 41.16 | true | 0.435275;0.435098;0.437951;0.439234;0.435961 | 164640;164640;164640;164640;164640 | 0;0;0;0;0 | 31104;29984;31776;32128;33024 | |
487 | densenet1_stage4_conv26_fwd | Convolution | [1,1680,7,7] | 2341.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 107.33 | 41297088 | 1290325.33 | 27744.00 | 3.10 | 31.33 | 384.76 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 41297088;41297088;41297088;41297088;41297088 | 1290496;1290240;1294080;1290240;1290240 | 27904;27520;27808;27904;27136 | |
488 | densenet1_stage4_batchnorm27_fwd | BatchNorm | [1,192,7,7] | 52.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 34933.33 | 6.80 | 1.73 | 16.46 | true | 0.068260;0.068253;0.068448;0.068446;0.068607 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;5376 | 34976;34848;34976;35104;34304 | |
489 | densenet1_stage4_relu27_fwd | Activation | [1,192,7,7] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 5333.33 | 29.40 | 3.39 | 4.70 | true | 0.291830;0.306020;0.286162;0.294251;0.294537 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 5504;4608;5248;5248;6560 | |
490 | densenet1_stage4_conv27_fwd | Convolution | [1,192,7,7] | 782.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.33 | 113541120 | 0.00 | 388853.33 | 6.20 | 291.99 | 4311.74 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 400768;391904;384608;389920;384736 | 7168;0;0;0;0 | |
490 | densenet1_stage4_conv27_fwd | Convolution | [1,192,7,7] | 782.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.67 | 2949120 | 0.00 | 693077.33 | 22.20 | 4.26 | 442.35 | true | 0.221743;0.221744;0.221732;0.221799;0.221754 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;0 | 687488;664224;694048;699232;697696 | |
490 | densenet1_stage4_conv27_fwd | Convolution | [1,192,7,7] | 782.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.33 | 2399424 | 682.67 | 399178.67 | 8.70 | 6.00 | 449.92 | true | 0.087421;0.087263;0.087419;0.087295;0.087496 | 2399424;2399424;2399424;2399424;2399424 | 363808;399072;399296;399168;399840 | 18688;1792;256;0;0 | |
490 | densenet1_stage4_conv27_fwd | Convolution | [1,192,7,7] | 782.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 770912.00 | 12.30 | 0.68 | 149.30 | true | 0.123014;0.123102;0.123311;0.123218;0.123296 | 746496;746496;746496;746496;746496 | 775584;802176;767264;757280;769888 | 331776;331776;331776;331776;331776 | |
491 | densenet1_stage4_concat13 | Concat | [1,1680,7,7] | 24.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 17877.33 | 21269.33 | 21.30 | 0.00 | 0.00 | true | 0.301871;0.119888;0.307771;0.119893;0.318182;0.119869;0.311968;0.119864;0.307354;0.119872 | 0;0;0;0;0;0;0;0;0;0 | 44160;0;38656;0;35584;0;35840;0;35840;0 | 25472;19136;22656;19328;26112;19456;23296;19232;23552;19328 | |
491 | densenet1_stage4_concat13 | Concat | [1,1680,7,7] | 24.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.57 | 0 | 17877.33 | 21269.33 | 21.30 | 0.00 | 0.00 | true | 0.301871;0.119888;0.307771;0.119893;0.318182;0.119869;0.311968;0.119864;0.307354;0.119872 | 0;0;0;0;0;0;0;0;0;0 | 44160;0;38656;0;35584;0;35840;0;35840;0 | 25472;19136;22656;19328;26112;19456;23296;19232;23552;19328 | |
492 | densenet1_stage4_batchnorm28_fwd | BatchNorm | [1,1728,7,7] | 38 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 592704 | 27648.00 | 621877.33 | 21.30 | 0.91 | 84.67 | true | 0.221454;0.209019;0.220509;0.206999;0.210439 | 592704;592704;592704;592704;592704 | 27648;27648;27648;27648;27648 | 620800;613824;620928;623904;624448 | |
493 | densenet1_stage4_relu28_fwd | Activation | [1,1728,7,7] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 169344 | 0.00 | 7520.00 | 44.20 | 22.52 | 42.34 | false | 0.442651;0.442171;0.441201;0.441982;0.442677 | 169344;169344;169344;169344;169344 | 0;0;0;0;1792 | 7584;8480;6656;7680;7296 | |
494 | densenet1_stage4_conv28_fwd | Convolution | [1,1728,7,7] | 2461 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 110.00 | 42476736 | 1327104.00 | 28224.00 | 3.10 | 31.34 | 386.15 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 42476736;42476736;42476736;42476736;42476736 | 28224;28480;28224;28224;28224 | 1327360;1327104;1327104;1327104;1327104 | |
495 | densenet1_stage4_batchnorm29_fwd | BatchNorm | [1,192,7,7] | 58.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 37066.67 | 6.90 | 1.64 | 16.46 | true | 0.068667;0.068706;0.068893;0.068369;0.068662 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 37664;36384;37024;36928;37248 | |
496 | densenet1_stage4_relu29_fwd | Activation | [1,192,7,7] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 1717.33 | 29.00 | 9.69 | 4.70 | true | 0.286554;0.290845;0.289669;0.290769;0.292972 | 18816;18816;18816;18816;18816 | 1792;1920;1664;1696;1312 | 224;224;224;224;224 | |
497 | densenet1_stage4_conv29_fwd | Convolution | [1,192,7,7] | 783.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 21.33 | 393664.00 | 6.20 | 288.41 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 398720;393280;389312;395200;392512 | 576;0;0;64;0 | |
497 | densenet1_stage4_conv29_fwd | Convolution | [1,192,7,7] | 783.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.67 | 2949120 | 0.00 | 705002.67 | 22.20 | 4.18 | 442.35 | true | 0.221865;0.221505;0.221590;0.223548;0.222038 | 2949120;2949120;2949120;2949120;2949120 | 0;0;0;0;0 | 693728;709440;704896;712896;700672 | |
497 | densenet1_stage4_conv29_fwd | Convolution | [1,192,7,7] | 783.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 787605.33 | 12.30 | 0.67 | 149.30 | true | 0.123189;0.123211;0.123030;0.123217;0.123099 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 783392;777056;788448;797632;790976 | |
497 | densenet1_stage4_conv29_fwd | Convolution | [1,192,7,7] | 783.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 384.00 | 399125.33 | 8.70 | 6.01 | 479.88 | true | 0.087723;0.087513;0.087518;0.087423;0.087359 | 2399424;2399424;2399424;2399424;2399424 | 1024;64;64;0;1088 | 398624;399584;399712;399168;398176 | |
498 | densenet1_stage4_concat14 | Concat | [1,1728,7,7] | 27.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 17237.33 | 8533.33 | 23.40 | 0.00 | 0.00 | true | 0.350835;0.119899;0.349613;0.119936;0.350895;0.119957;0.348726;0.119937;0.348468;0.119884 | 0;0;0;0;0;0;0;0;0;0 | 7264;9856;6688;9984;7968;9600;4896;9728;6912;9728 | 35968;0;35584;0;35072;0;34432;0;33920;0 | |
498 | densenet1_stage4_concat14 | Concat | [1,1728,7,7] | 27.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 17237.33 | 8533.33 | 23.40 | 0.00 | 0.00 | true | 0.350835;0.119899;0.349613;0.119936;0.350895;0.119957;0.348726;0.119937;0.348468;0.119884 | 0;0;0;0;0;0;0;0;0;0 | 35968;0;35584;0;35072;0;34432;0;33920;0 | 7264;9856;6688;9984;7968;9600;4896;9728;6912;9728 | |
499 | densenet1_stage4_batchnorm30_fwd | BatchNorm | [1,1776,7,7] | 44.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 609168 | 28416.00 | 586645.33 | 20.70 | 0.99 | 87.02 | true | 0.204693;0.212950;0.206408;0.207166;0.207723 | 609168;609168;609168;609168;609168 | 28416;28416;28416;28416;28416 | 585216;581216;586368;592544;588352 | |
500 | densenet1_stage4_relu30_fwd | Activation | [1,1776,7,7] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 174048 | 0.00 | 32501.33 | 44.80 | 5.36 | 43.51 | true | 0.448617;0.448401;0.449916;0.448265;0.447100 | 174048;174048;174048;174048;174048 | 0;0;0;0;0 | 30112;32672;32832;34720;32000 | |
501 | densenet1_stage4_conv30_fwd | Convolution | [1,1776,7,7] | 2443.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 113.00 | 43656384 | 1363968.00 | 27061.33 | 3.10 | 31.38 | 386.34 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 43656384;43656384;43656384;43656384;43656384 | 1364224;1363968;1363968;1363968;1363968 | 27136;27040;26880;27008;27520 | |
502 | densenet1_stage4_batchnorm31_fwd | BatchNorm | [1,192,7,7] | 53.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 36458.67 | 6.70 | 1.67 | 16.46 | true | 0.065012;0.068423;0.065501;0.068406;0.066664 | 65856;65856;65856;65856;65856 | 36128;36512;37024;36256;36608 | 3072;4096;3072;3072;3072 | |
503 | densenet1_stage4_relu31_fwd | Activation | [1,192,7,7] | 15.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 18816 | 224.00 | 3712.00 | 29.40 | 4.78 | 4.34 | true | 0.294558;0.301020;0.291401;0.294619;0.293782 | 18816;18816;18816;18816;18816 | 3712;3712;4096;3712;3616 | 224;224;224;224;224 | |
504 | densenet1_stage4_conv31_fwd | Convolution | [1,192,7,7] | 757 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 399882.67 | 6.20 | 283.94 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 2304;0;0;0;0 | 393664;400352;402272;397024;414240 | |
504 | densenet1_stage4_conv31_fwd | Convolution | [1,192,7,7] | 757 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 722368.00 | 22.20 | 4.08 | 491.52 | true | 0.221878;0.221559;0.222027;0.221824;0.226174 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;0 | 726784;718656;722848;720256;724000 | |
504 | densenet1_stage4_conv31_fwd | Convolution | [1,192,7,7] | 757 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.33 | 746496 | 331776.00 | 806432.00 | 12.30 | 0.66 | 139.98 | true | 0.123215;0.123267;0.123257;0.123140;0.123226 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 798848;808352;808448;804832;806112 | |
504 | densenet1_stage4_conv31_fwd | Convolution | [1,192,7,7] | 757 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1408.00 | 398058.67 | 8.70 | 6.01 | 479.88 | true | 0.087227;0.087316;0.087116;0.087136;0.087231 | 2399424;2399424;2399424;2399424;2399424 | 4224;0;0;0;11008 | 396736;399040;398400;399712;382368 | |
505 | densenet1_stage4_concat15 | Concat | [1,1776,7,7] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 21162.67 | 14789.33 | 21.60 | 0.00 | 0.00 | true | 0.317260;0.119951;0.312832;0.119959;0.311359;0.119911;0.316989;0.119910;0.314764;0.119910 | 0;0;0;0;0;0;0;0;0;0 | 7168;19488;6944;19360;6304;19232;16544;36384;6560;19616 | 39552;0;38528;0;38400;0;44800;10496;43136;0 | |
505 | densenet1_stage4_concat15 | Concat | [1,1776,7,7] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 21162.67 | 14789.33 | 21.60 | 0.00 | 0.00 | true | 0.317260;0.119951;0.312832;0.119959;0.311359;0.119911;0.316989;0.119910;0.314764;0.119910 | 0;0;0;0;0;0;0;0;0;0 | 7168;19488;6944;19360;6304;19232;16544;36384;6560;19616 | 39552;0;38528;0;38400;0;44800;10496;43136;0 | |
506 | densenet1_stage4_batchnorm32_fwd | BatchNorm | [1,1824,7,7] | 34 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 625632 | 29184.00 | 671904.00 | 20.20 | 0.89 | 89.38 | true | 0.200646;0.202884;0.204185;0.198890;0.201047 | 625632;625632;625632;625632;625632 | 29184;29184;29184;29184;29184 | 671776;670848;674144;646816;673088 | |
507 | densenet1_stage4_relu32_fwd | Activation | [1,1824,7,7] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 178752 | 0.00 | 7573.33 | 45.40 | 23.60 | 44.69 | false | 0.455124;0.454983;0.452965;0.453624;0.453984 | 178752;178752;178752;178752;178752 | 9728;0;0;0;0 | 7456;7840;7200;7424;8064 | |
508 | densenet1_stage4_conv32_fwd | Convolution | [1,1824,7,7] | 2473 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 117.00 | 44836032 | 1400917.33 | 27072.00 | 3.10 | 31.40 | 383.21 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 44836032;44836032;44836032;44836032;44836032 | 1401088;1400832;1400832;1402624;1400832 | 26816;27200;26944;27712;27072 | |
509 | densenet1_stage4_batchnorm33_fwd | BatchNorm | [1,192,7,7] | 57.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 37845.33 | 6.80 | 1.61 | 16.46 | true | 0.068564;0.068707;0.068330;0.068267;0.068535 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 37920;37536;38048;37792;37824 | |
510 | densenet1_stage4_relu33_fwd | Activation | [1,192,7,7] | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 18816 | 224.00 | 2901.33 | 29.40 | 6.02 | 4.03 | true | 0.323643;0.296044;0.293850;0.289185;0.290889 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 2816;3200;2304;2688;3872 | |
511 | densenet1_stage4_conv33_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.33 | 113541120 | 0.00 | 404064.00 | 6.20 | 281.00 | 4311.74 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;0;1792;0;0 | 405184;406048;412096;400960;392896 | |
511 | densenet1_stage4_conv33_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 85.33 | 734122.67 | 22.20 | 4.02 | 491.52 | true | 0.221976;0.221728;0.221634;0.221397;0.221873 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;6912 | 726240;730944;740320;733248;738176 | |
511 | densenet1_stage4_conv33_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.33 | 746496 | 331776.00 | 805674.67 | 12.30 | 0.66 | 139.98 | true | 0.123295;0.123156;0.123313;0.123244;0.123124 | 746496;746496;746496;746496;746496 | 807968;801248;794560;807808;813728 | 331776;331776;331776;331776;331776 | |
511 | densenet1_stage4_conv33_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1088.00 | 399424.00 | 8.70 | 5.99 | 479.88 | true | 0.087488;0.087360;0.087563;0.087540;0.087469 | 2399424;2399424;2399424;2399424;2399424 | 576;2624;3136;64;64 | 399744;399328;396800;400096;399200 | |
512 | densenet1_stage4_concat16 | Concat | [1,1824,7,7] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 25664.00 | 15850.67 | 24.20 | 0.00 | 0.00 | true | 0.364633;0.119906;0.365178;0.119899;0.364552;0.119944;0.364841;0.119960;0.365919;0.119883 | 0;0;0;0;0;0;0;0;0;0 | 52608;0;55168;0;51584;0;51072;0;51328;0 | 21568;9728;22944;9728;23744;9472;22400;9888;21664;9856 | |
512 | densenet1_stage4_concat16 | Concat | [1,1824,7,7] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 25664.00 | 15850.67 | 24.20 | 0.00 | 0.00 | true | 0.364633;0.119906;0.365178;0.119899;0.364552;0.119944;0.364841;0.119960;0.365919;0.119883 | 0;0;0;0;0;0;0;0;0;0 | 52608;0;55168;0;51584;0;51072;0;51328;0 | 21568;9728;22944;9728;23744;9472;22400;9888;21664;9856 | |
513 | densenet1_stage4_batchnorm34_fwd | BatchNorm | [1,1872,7,7] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 642096 | 29952.00 | 609194.67 | 21.10 | 1.00 | 91.73 | true | 0.210116;0.213088;0.209905;0.212970;0.205051 | 642096;642096;642096;642096;642096 | 29952;29952;29952;29952;29952 | 613312;604576;592320;611104;611904 | |
514 | densenet1_stage4_relu34_fwd | Activation | [1,1872,7,7] | 24 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 183456 | 0.00 | 32405.33 | 45.80 | 5.66 | 45.86 | true | 0.457684;0.458292;0.458891;0.459064;0.457364 | 183456;183456;183456;183456;183456 | 32288;36640;32512;32032;32416 | 0;0;0;0;0 | |
515 | densenet1_stage4_conv34_fwd | Convolution | [1,1872,7,7] | 2527.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 118.67 | 46015680 | 1437696.00 | 51541.33 | 3.10 | 30.90 | 387.77 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 46015680;46015680;46015680;46015680;46015680 | 50688;52736;51456;51200;51968 | 1437952;1437696;1437696;1437696;1437696 | |
516 | densenet1_stage4_batchnorm35_fwd | BatchNorm | [1,192,7,7] | 54.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 35818.67 | 6.80 | 1.69 | 16.46 | true | 0.068983;0.067709;0.069162;0.067866;0.068289 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 37024;34976;36256;35456;35744 | |
517 | densenet1_stage4_relu35_fwd | Activation | [1,192,7,7] | 14.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 4224.00 | 29.10 | 4.23 | 4.70 | true | 0.289325;0.289305;0.286595;0.295472;0.295984 | 18816;18816;18816;18816;18816 | 224;224;224;2528;224 | 3968;4352;4096;4768;4224 | |
518 | densenet1_stage4_conv35_fwd | Convolution | [1,192,7,7] | 752.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.33 | 113541120 | 256.00 | 407381.33 | 6.20 | 278.53 | 4311.74 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 512;256;5120;0;0 | 413312;399136;420096;403744;405088 | |
518 | densenet1_stage4_conv35_fwd | Convolution | [1,192,7,7] | 752.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.67 | 2949120 | 0.00 | 737301.33 | 22.20 | 4.00 | 442.35 | true | 0.221943;0.222175;0.221921;0.222122;0.221837 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;0 | 685600;737440;735168;739296;741760 | |
518 | densenet1_stage4_conv35_fwd | Convolution | [1,192,7,7] | 752.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 820661.33 | 12.30 | 0.65 | 149.30 | true | 0.123135;0.123058;0.123258;0.123178;0.123103 | 746496;746496;746496;746496;746496 | 853632;804576;818400;819520;824064 | 331776;331776;331776;331776;331776 | |
518 | densenet1_stage4_conv35_fwd | Convolution | [1,192,7,7] | 752.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1920.00 | 395381.33 | 8.70 | 6.04 | 479.88 | true | 0.087345;0.087323;0.087218;0.087265;0.087520 | 2399424;2399424;2399424;2399424;2399424 | 512;7424;5248;0;0 | 397728;386720;389664;398752;399488 | |
519 | densenet1_stage4_concat17 | Concat | [1,1872,7,7] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.29 | 0 | 48554.67 | 18245.33 | 22.70 | 0.00 | 0.00 | true | 0.335328;0.119952;0.333701;0.119937;0.335051;0.119928;0.332985;0.119937;0.338735;0.119943 | 0;0;0;0;0;0;0;0;0;0 | 15488;19104;15648;19232;18208;15648;17952;29696;26816;19328 | 97280;0;98944;0;96128;0;92800;5120;100224;0 | |
519 | densenet1_stage4_concat17 | Concat | [1,1872,7,7] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.29 | 0 | 48554.67 | 18245.33 | 22.70 | 0.00 | 0.00 | true | 0.335328;0.119952;0.333701;0.119937;0.335051;0.119928;0.332985;0.119937;0.338735;0.119943 | 0;0;0;0;0;0;0;0;0;0 | 15488;19104;15648;19232;18208;15648;17952;29696;26816;19328 | 97280;0;98944;0;96128;0;92800;5120;100224;0 | |
520 | densenet1_stage4_batchnorm36_fwd | BatchNorm | [1,1920,7,7] | 36 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 658560 | 30720.00 | 698517.33 | 20.10 | 0.90 | 94.08 | true | 0.195952;0.200121;0.204263;0.201366;0.200484 | 658560;658560;658560;658560;658560 | 30720;30720;30720;30720;30720 | 703616;704800;699968;690432;691968 | |
521 | densenet1_stage4_relu36_fwd | Activation | [1,1920,7,7] | 25 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 188160 | 0.00 | 8362.67 | 46.40 | 22.50 | 43.42 | false | 0.465047;0.464362;0.463698;0.464171;0.463544 | 188160;188160;188160;188160;188160 | 5120;0;0;0;0 | 8192;8448;8480;8448;7712 | |
522 | densenet1_stage4_conv36_fwd | Convolution | [1,1920,7,7] | 2632 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 121.00 | 47195328 | 1474816.00 | 26858.67 | 3.10 | 31.43 | 390.04 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 47195328;47195328;47195328;47195328;47195328 | 26688;26560;26944;27456;26944 | 1474816;1475072;1474560;1491968;1474560 | |
523 | densenet1_stage4_batchnorm37_fwd | BatchNorm | [1,192,7,7] | 54.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 35018.67 | 6.90 | 1.73 | 16.46 | true | 0.068519;0.068460;0.068377;0.068571;0.068636 | 65856;65856;65856;65856;65856 | 35360;34976;35360;34720;34464 | 3072;3072;3072;7424;3072 | |
524 | densenet1_stage4_relu37_fwd | Activation | [1,192,7,7] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 5034.67 | 29.40 | 3.58 | 4.70 | true | 0.287092;0.298107;0.301908;0.291887;0.291957 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 4736;6016;4608;4736;5632 | |
525 | densenet1_stage4_conv37_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 416224.00 | 6.20 | 272.79 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;0;0;0;0 | 417600;415776;421056;415296;411648 | |
525 | densenet1_stage4_conv37_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 761514.67 | 22.20 | 3.87 | 491.52 | true | 0.224192;0.221509;0.221656;0.221840;0.222189 | 2949120;2949120;2949120;2949120;2949120 | 11008;0;0;0;0 | 771712;766400;758048;760096;676320 | |
525 | densenet1_stage4_conv37_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 822624.00 | 12.30 | 0.65 | 149.30 | true | 0.123256;0.123198;0.123244;0.123220;0.123115 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;333568 | 804096;819072;818688;830112;897312 | |
525 | densenet1_stage4_conv37_fwd | Convolution | [1,192,7,7] | 753 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 234.67 | 399584.00 | 8.70 | 6.00 | 479.88 | true | 0.087454;0.087290;0.087579;0.087689;0.087412 | 2399424;2399424;2399424;2399424;2399424 | 400000;399040;386912;399712;400224 | 576;64;11968;64;64 | |
526 | densenet1_stage4_concat18 | Concat | [1,1920,7,7] | 28.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 25941.33 | 18240.00 | 24.40 | 0.00 | 0.00 | true | 0.370116;0.119925;0.369083;0.119899;0.370991;0.119941;0.365801;0.119916;0.370802;0.119895 | 0;0;0;0;0;0;0;0;0;0 | 52224;0;51840;0;62080;0;51584;0;59136;0 | 27296;9728;25888;14496;25280;9856;22784;9728;33024;11136 | |
526 | densenet1_stage4_concat18 | Concat | [1,1920,7,7] | 28.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 25941.33 | 18240.00 | 24.40 | 0.00 | 0.00 | true | 0.370116;0.119925;0.369083;0.119899;0.370991;0.119941;0.365801;0.119916;0.370802;0.119895 | 0;0;0;0;0;0;0;0;0;0 | 52224;0;51840;0;62080;0;51584;0;59136;0 | 27296;9728;25888;14496;25280;9856;22784;9728;33024;11136 | |
527 | densenet1_stage4_batchnorm38_fwd | BatchNorm | [1,1968,7,7] | 39 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 675024 | 31488.00 | 625781.33 | 20.70 | 1.03 | 96.43 | true | 0.207717;0.207362;0.207192;0.199961;0.219025 | 675024;675024;675024;675024;675024 | 31488;31488;31488;31488;31488 | 627392;618208;630080;634944;619872 | |
528 | densenet1_stage4_relu38_fwd | Activation | [1,1968,7,7] | 25.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 192864 | 0.00 | 36373.33 | 47.00 | 5.30 | 48.22 | true | 0.470174;0.468077;0.469699;0.470246;0.469625 | 192864;192864;192864;192864;192864 | 0;1024;0;0;0 | 35968;34816;36256;37408;36896 | |
529 | densenet1_stage4_conv38_fwd | Convolution | [1,1968,7,7] | 2653.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 124.33 | 48374976 | 1511424.00 | 48682.67 | 3.10 | 31.01 | 389.08 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 48374976;48374976;48374976;48374976;48374976 | 1511680;1511424;1511424;1511424;1511424 | 50944;50944;46208;47104;48000 | |
530 | densenet1_stage4_batchnorm39_fwd | BatchNorm | [1,192,7,7] | 58 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.67 | 65856 | 3072.00 | 36042.67 | 6.70 | 1.68 | 14.11 | true | 0.066662;0.069039;0.066582;0.067522;0.067377 | 65856;65856;65856;65856;65856 | 3072;3072;5376;3072;3072 | 35616;35744;37792;36640;35744 | |
531 | densenet1_stage4_relu39_fwd | Activation | [1,192,7,7] | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 3968.00 | 29.30 | 4.49 | 4.70 | true | 0.292423;0.294829;0.287993;0.291056;0.296303 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 4224;3712;4096;3840;3968 | |
532 | densenet1_stage4_conv39_fwd | Convolution | [1,192,7,7] | 757.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 414624.00 | 6.20 | 273.84 | 4366.97 | false | 0.062455;0.062455;0.062454;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 512;0;0;0;0 | 416992;413376;415008;415488;413056 | |
532 | densenet1_stage4_conv39_fwd | Convolution | [1,192,7,7] | 757.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.33 | 2949120 | 0.00 | 761098.67 | 22.20 | 3.87 | 465.68 | true | 0.221745;0.221823;0.222008;0.223519;0.221978 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;0 | 755680;761504;767968;752032;766112 | |
532 | densenet1_stage4_conv39_fwd | Convolution | [1,192,7,7] | 757.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 840960.00 | 12.30 | 0.64 | 149.30 | true | 0.123163;0.123325;0.123037;0.123243;0.123272 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 839872;842528;840480;857280;836032 | |
532 | densenet1_stage4_conv39_fwd | Convolution | [1,192,7,7] | 757.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 0.00 | 399146.67 | 8.70 | 6.01 | 479.88 | true | 0.087461;0.087378;0.087252;0.087327;0.087489 | 2399424;2399424;2399424;2399424;2399424 | 512;0;0;0;0 | 398880;399168;398880;399392;399392 | |
533 | densenet1_stage4_concat19 | Concat | [1,1968,7,7] | 25.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.29 | 0 | 48320.00 | 22272.00 | 23.40 | 0.00 | 0.00 | true | 0.349891;0.119914;0.349934;0.119926;0.347388;0.119886;0.352055;0.119889;0.346976;0.119902 | 0;0;0;0;0;0;0;0;0;0 | 100864;0;96640;0;102144;0;96640;0;96640;0 | 25248;22528;22816;20224;25760;19968;22592;19968;25888;20224 | |
533 | densenet1_stage4_concat19 | Concat | [1,1968,7,7] | 25.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 48320.00 | 22272.00 | 23.40 | 0.00 | 0.00 | true | 0.349891;0.119914;0.349934;0.119926;0.347388;0.119886;0.352055;0.119889;0.346976;0.119902 | 0;0;0;0;0;0;0;0;0;0 | 25248;22528;22816;20224;25760;19968;22592;19968;25888;20224 | 100864;0;96640;0;102144;0;96640;0;96640;0 | |
534 | densenet1_stage4_batchnorm40_fwd | BatchNorm | [1,2016,7,7] | 35 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 691488 | 32256.00 | 732224.00 | 21.50 | 0.90 | 94.30 | true | 0.225732;0.218492;0.214807;0.210884;0.207684 | 691488;691488;691488;691488;691488 | 32256;37888;32256;32256;32256 | 728224;733312;732416;735264;730944 | |
535 | densenet1_stage4_relu40_fwd | Activation | [1,2016,7,7] | 26 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 197568 | 0.00 | 9056.00 | 47.60 | 21.82 | 45.60 | false | 0.476384;0.475923;0.475694;0.475363;0.476930 | 197568;197568;197568;197568;197568 | 9728;8192;8576;9056;9536 | 0;256;0;0;0 | |
536 | densenet1_stage4_conv40_fwd | Convolution | [1,2016,7,7] | 2726.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 127.67 | 49554624 | 1548288.00 | 26773.33 | 3.10 | 31.46 | 388.16 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 49554624;49554624;49554624;49554624;49554624 | 1548544;1548288;1548288;1548288;1548288 | 26304;26688;26688;26944;27072 | |
537 | densenet1_stage4_batchnorm41_fwd | BatchNorm | [1,192,7,7] | 52.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 35701.33 | 6.80 | 1.70 | 16.46 | true | 0.068519;0.067807;0.068356;0.068540;0.068458 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 37408;35616;35872;35456;35616 | |
538 | densenet1_stage4_relu41_fwd | Activation | [1,192,7,7] | 16.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 5258.67 | 29.10 | 3.43 | 4.70 | true | 0.295320;0.286802;0.290551;0.291700;0.290216 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 3840;5632;5376;5152;5248 | |
539 | densenet1_stage4_conv41_fwd | Convolution | [1,192,7,7] | 757.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 427093.33 | 6.20 | 265.85 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 512;0;0;0;0 | 428448;429216;424864;405792;427968 | |
539 | densenet1_stage4_conv41_fwd | Convolution | [1,192,7,7] | 757.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.00 | 2949120 | 0.00 | 779306.67 | 22.20 | 3.78 | 491.52 | true | 0.221631;0.221804;0.221742;0.223617;0.221964 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;0 | 776672;782560;778688;789056;775840 | |
539 | densenet1_stage4_conv41_fwd | Convolution | [1,192,7,7] | 757.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331818.67 | 858250.67 | 12.30 | 0.63 | 149.30 | true | 0.123106;0.123190;0.123069;0.123236;0.123118 | 746496;746496;746496;746496;746496 | 331776;331840;333312;331776;331840 | 856384;843168;856992;878048;861376 | |
539 | densenet1_stage4_conv41_fwd | Convolution | [1,192,7,7] | 757.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 42.67 | 397632.00 | 8.70 | 6.03 | 479.88 | true | 0.087564;0.087364;0.087627;0.087282;0.087379 | 2399424;2399424;2399424;2399424;2399424 | 64;0;64;6464;0 | 397280;398240;398688;376416;397376 | |
540 | densenet1_stage4_concat20 | Concat | [1,2016,7,7] | 29.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 38336.00 | 12752.00 | 25.40 | 0.00 | 0.00 | true | 0.387778;0.119903;0.391346;0.119906;0.387661;0.119958;0.387809;0.119957;0.386880;0.119904 | 0;0;0;0;0;0;0;0;0;0 | 80896;256;74752;0;78336;0;94464;0;76672;0 | 27552;8064;11712;11904;12320;12800;14656;12672;14752;12160 | |
540 | densenet1_stage4_concat20 | Concat | [1,2016,7,7] | 29.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.71 | 0 | 38336.00 | 12752.00 | 25.40 | 0.00 | 0.00 | true | 0.387778;0.119903;0.391346;0.119906;0.387661;0.119958;0.387809;0.119957;0.386880;0.119904 | 0;0;0;0;0;0;0;0;0;0 | 80896;256;74752;0;78336;0;94464;0;76672;0 | 27552;8064;11712;11904;12320;12800;14656;12672;14752;12160 | |
541 | densenet1_stage4_batchnorm42_fwd | BatchNorm | [1,2064,7,7] | 39.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.00 | 707952 | 33024.00 | 664256.00 | 20.80 | 1.02 | 101.14 | true | 0.207867;0.201457;0.202879;0.215274;0.212720 | 707952;707952;707952;707952;707952 | 636832;665600;666016;661152;667424 | 33024;33024;33024;33024;33024 | |
542 | densenet1_stage4_relu42_fwd | Activation | [1,2064,7,7] | 27 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 202272 | 0.00 | 36405.33 | 48.10 | 5.56 | 46.68 | true | 0.479681;0.480291;0.481578;0.481504;0.480469 | 202272;202272;202272;202272;202272 | 35744;38944;37024;36448;35232 | 0;0;5888;0;0 | |
543 | densenet1_stage4_conv42_fwd | Convolution | [1,2064,7,7] | 2773.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 131.00 | 50734272 | 1586005.33 | 57525.33 | 3.10 | 30.87 | 387.28 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50734272;50734272;50734272;50734272;50734272 | 1585408;1585152;1587456;1585152;1588992 | 59424;56320;56448;60416;56704 | |
544 | densenet1_stage4_batchnorm43_fwd | BatchNorm | [1,192,7,7] | 53.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 37216.00 | 6.80 | 1.63 | 16.46 | true | 0.068388;0.068338;0.068438;0.068130;0.068187 | 65856;65856;65856;65856;65856 | 37344;38176;37152;37152;36736 | 3072;3072;3072;3072;3072 | |
545 | densenet1_stage4_relu43_fwd | Activation | [1,192,7,7] | 16 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 4874.67 | 29.30 | 3.69 | 4.70 | true | 0.290170;0.294405;0.291705;0.342973;0.293242 | 18816;18816;18816;18816;18816 | 4736;3584;4864;5120;5024 | 224;224;224;224;224 | |
546 | densenet1_stage4_conv43_fwd | Convolution | [1,192,7,7] | 761.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 424085.33 | 6.20 | 267.73 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 512;0;0;0;0 | 425504;421696;433472;423328;423424 | |
546 | densenet1_stage4_conv43_fwd | Convolution | [1,192,7,7] | 761.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.33 | 2949120 | 0.00 | 790698.67 | 22.20 | 3.73 | 465.68 | true | 0.221702;0.221701;0.223915;0.221803;0.221447 | 2949120;2949120;2949120;2949120;2949120 | 256;0;0;0;0 | 788864;790080;792640;789376;796576 | |
546 | densenet1_stage4_conv43_fwd | Convolution | [1,192,7,7] | 761.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 877162.67 | 12.30 | 0.62 | 149.30 | true | 0.123273;0.123085;0.123084;0.123219;0.123299 | 746496;746496;746496;746496;746496 | 331776;331776;331776;331776;331776 | 877440;876480;884928;877568;875680 | |
546 | densenet1_stage4_conv43_fwd | Convolution | [1,192,7,7] | 761.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1024.00 | 398954.67 | 8.70 | 6.00 | 479.88 | true | 0.087660;0.087512;0.087510;0.087217;0.087312 | 2399424;2399424;2399424;2399424;2399424 | 400064;396864;384320;399936;400064 | 512;2560;4736;0;0 | |
547 | densenet1_stage4_concat21 | Concat | [1,2064,7,7] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 53536.00 | 30741.33 | 24.20 | 0.00 | 0.00 | true | 0.360436;0.119959;0.363226;0.119954;0.366693;0.119909;0.367227;0.119915;0.369610;0.119915 | 0;0;0;0;0;0;0;0;0;0 | 107072;0;106944;0;115648;0;107200;0;112320;0 | 34944;26368;36992;26112;35712;25856;35200;26112;38016;25600 | |
547 | densenet1_stage4_concat21 | Concat | [1,2064,7,7] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.00 | 0 | 53536.00 | 30741.33 | 24.20 | 0.00 | 0.00 | true | 0.360436;0.119959;0.363226;0.119954;0.366693;0.119909;0.367227;0.119915;0.369610;0.119915 | 0;0;0;0;0;0;0;0;0;0 | 107072;0;106944;0;115648;0;107200;0;112320;0 | 34944;26368;36992;26112;35712;25856;35200;26112;38016;25600 | |
548 | densenet1_stage4_batchnorm44_fwd | BatchNorm | [1,2112,7,7] | 37 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.67 | 724416 | 33792.00 | 754442.67 | 20.70 | 0.92 | 94.48 | true | 0.199650;0.210875;0.208119;0.203356;0.212160 | 724416;724416;724416;724416;724416 | 33792;33792;33792;33792;33792 | 755200;754240;756416;751840;753888 | |
549 | densenet1_stage4_relu44_fwd | Activation | [1,2112,7,7] | 26.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 206976 | 0.00 | 8672.00 | 48.60 | 23.87 | 51.74 | false | 0.486272;0.486033;0.485658;0.486281;0.485997 | 206976;206976;206976;206976;206976 | 288;0;0;0;0 | 9696;8064;8064;8608;9344 | |
550 | densenet1_stage4_conv44_fwd | Convolution | [1,2112,7,7] | 2859.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 135.00 | 51913920 | 1622016.00 | 26944.00 | 3.10 | 31.48 | 384.55 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 51913920;51913920;51913920;51913920;51913920 | 1623040;1622016;1622016;1622016;1622016 | 28000;26944;26944;26944;26688 | |
551 | densenet1_stage4_batchnorm45_fwd | BatchNorm | [1,192,7,7] | 57.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 35424.00 | 6.80 | 1.71 | 16.46 | true | 0.068386;0.068096;0.067782;0.068350;0.068256 | 65856;65856;65856;65856;65856 | 35328;35104;35872;35840;34816 | 3072;3072;3072;3072;3072 | |
552 | densenet1_stage4_relu45_fwd | Activation | [1,192,7,7] | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 5098.67 | 29.20 | 3.54 | 4.70 | true | 0.288002;0.305698;0.290695;0.293364;0.292565 | 18816;18816;18816;18816;18816 | 224;224;224;224;224 | 5408;5248;4608;4640;5536 | |
553 | densenet1_stage4_conv45_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 85.33 | 433418.67 | 6.20 | 261.91 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 432896;435872;427136;431488;442944 | 832;0;256;0;0 | |
553 | densenet1_stage4_conv45_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.67 | 2949120 | 170.67 | 801514.67 | 22.20 | 3.68 | 442.35 | true | 0.221803;0.221611;0.221562;0.221837;0.222029 | 2949120;2949120;2949120;2949120;2949120 | 769760;783264;825824;804128;817152 | 512;5632;0;0;0 | |
553 | densenet1_stage4_conv45_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 890485.33 | 12.30 | 0.61 | 149.30 | true | 0.123291;0.123295;0.123101;0.123194;0.123118 | 746496;746496;746496;746496;746496 | 910912;902656;869248;879936;888864 | 331840;331776;331776;331776;331776 | |
553 | densenet1_stage4_conv45_fwd | Convolution | [1,192,7,7] | 754.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 213.33 | 399296.00 | 8.70 | 6.01 | 479.88 | true | 0.087484;0.087426;0.087463;0.087499;0.087787 | 2399424;2399424;2399424;2399424;2399424 | 512;64;576;64;64 | 399744;399712;398944;399232;388448 | |
554 | densenet1_stage4_concat22 | Concat | [1,2112,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.14 | 0 | 72106.67 | 35050.67 | 25.70 | 0.00 | 0.00 | true | 0.405154;0.119900;0.396089;0.119906;0.392346;0.119926;0.393469;0.119947;0.393461;0.119864 | 0;0;0;0;0;0;0;0;0;0 | 49472;27392;49344;17280;50944;16768;50976;16896;49280;17536 | 146816;0;142592;0;143232;0;156416;0;158976;0 | |
554 | densenet1_stage4_concat22 | Concat | [1,2112,7,7] | 30.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 3.86 | 0 | 72106.67 | 35050.67 | 25.70 | 0.00 | 0.00 | true | 0.405154;0.119900;0.396089;0.119906;0.392346;0.119926;0.393469;0.119947;0.393461;0.119864 | 0;0;0;0;0;0;0;0;0;0 | 49472;27392;49344;17280;50944;16768;50976;16896;49280;17536 | 146816;0;142592;0;143232;0;156416;0;158976;0 | |
555 | densenet1_stage4_batchnorm46_fwd | BatchNorm | [1,2160,7,7] | 41.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.33 | 740880 | 34560.00 | 654282.67 | 21.10 | 1.08 | 101.03 | true | 0.209642;0.209962;0.218142;0.212169;0.208327 | 740880;740880;740880;740880;740880 | 34560;34560;34560;34560;34560 | 652288;664832;649312;638272;661248 | |
556 | densenet1_stage4_relu46_fwd | Activation | [1,2160,7,7] | 28.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 211680 | 0.00 | 35978.67 | 49.50 | 5.88 | 45.36 | true | 0.503704;0.516721;0.490843;0.490491;0.489463 | 211680;211680;211680;211680;211680 | 35104;34432;36576;36256;38048 | 0;0;0;10496;0 | |
557 | densenet1_stage4_conv46_fwd | Convolution | [1,2160,7,7] | 2956 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 137.33 | 53093568 | 1658880.00 | 98442.67 | 3.10 | 30.21 | 386.60 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 53093568;53093568;53093568;53093568;53093568 | 1665536;1658880;1658880;1658880;1658880 | 99584;94880;100352;100992;95392 | |
558 | densenet1_stage4_batchnorm47_fwd | BatchNorm | [1,192,7,7] | 55 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 4.00 | 65856 | 3072.00 | 36437.33 | 6.70 | 1.67 | 16.46 | true | 0.066758;0.067708;0.068230;0.064961;0.068027 | 65856;65856;65856;65856;65856 | 3072;3072;3072;3072;3072 | 37504;34976;37920;36096;35712 | |
559 | densenet1_stage4_relu47_fwd | Activation | [1,192,7,7] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 18816 | 224.00 | 4298.67 | 29.10 | 4.16 | 4.70 | true | 0.291470;0.291388;0.286541;0.290357;0.295047 | 18816;18816;18816;18816;18816 | 128;224;224;224;224 | 4256;4992;4096;4128;4512 | |
560 | densenet1_stage4_conv47_fwd | Convolution | [1,192,7,7] | 750.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_sgemm_128x64_nn | 26.00 | 113541120 | 0.00 | 422208.00 | 6.20 | 268.92 | 4366.97 | false | 0.062455;0.062455;0.062455;0.062455;0.062455 | 113541120;113541120;113541120;113541120;113541120 | 0;0;0;0;0 | 421664;424384;422080;422880;410208 | |
560 | densenet1_stage4_conv47_fwd | Convolution | [1,192,7,7] | 750.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardData4x4<float, float>(cudnn::winograd_nonfused::WinogradDataParams<float, float>) | 6.33 | 2949120 | 0.00 | 797610.67 | 22.20 | 3.70 | 465.68 | true | 0.221676;0.221900;0.221699;0.221856;0.221636 | 2949120;2949120;2949120;2949120;2949120 | 0;0;0;0;0 | 817120;814880;761600;759552;816352 | |
560 | densenet1_stage4_conv47_fwd | Convolution | [1,192,7,7] | 750.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardFilter4x4<float, float>(cudnn::winograd_nonfused::WinogradFilterParams<float, float>) | 5.00 | 746496 | 331776.00 | 909024.00 | 12.30 | 0.60 | 149.30 | true | 0.123329;0.123175;0.123036;0.123269;0.123120 | 746496;746496;746496;746496;746496 | 894432;892672;920768;925312;911872 | 331776;331776;332032;331776;331776 | |
560 | densenet1_stage4_conv47_fwd | Convolution | [1,192,7,7] | 750.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd_nonfused::winogradForwardOutput4x4<float, float>(cudnn::winograd_nonfused::WinogradOutputParams<float, float>) | 5.00 | 2399424 | 1877.33 | 396736.00 | 8.70 | 6.02 | 479.88 | true | 0.087639;0.087624;0.087218;0.087334;0.087374 | 2399424;2399424;2399424;2399424;2399424 | 399936;390464;400064;399808;388288 | 0;6144;0;0;5632 | |
561 | densenet1_stage4_concat23 | Concat | [1,2160,7,7] | 27.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.29 | 0 | 89781.33 | 44864.00 | 25.00 | 0.00 | 0.00 | true | 0.379353;0.119917;0.386606;0.119936;0.379666;0.119896;0.379335;0.119900;0.384729;0.119938 | 0;0;0;0;0;0;0;0;0;0 | 70272;21760;64768;22400;70016;21248;68736;21760;69760;21632 | 178752;0;184384;2048;177216;0;180672;0;182080;0 | |
561 | densenet1_stage4_concat23 | Concat | [1,2160,7,7] | 27.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::saveto, 8, mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float> >(mshadow::expr::Plan<mshadow::expr::SliceExp<mshadow::Tensor<mshadow::gpu, 3, float>, mshadow::gpu, float, 3, 2>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 3, float>, float>) | 4.14 | 0 | 89781.33 | 44864.00 | 25.00 | 0.00 | 0.00 | true | 0.379353;0.119917;0.386606;0.119936;0.379666;0.119896;0.379335;0.119900;0.384729;0.119938 | 0;0;0;0;0;0;0;0;0;0 | 70272;21760;64768;22400;70016;21248;68736;21760;69760;21632 | 178752;0;184384;2048;177216;0;180672;0;182080;0 | |
562 | densenet1_batchnorm4_fwd | BatchNorm | [1,2208,7,7] | 37.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::bn_fw_inf_1C11_kernel_NCHW<float, float, true, 1>(float, float, cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnTensorStruct, float const*, float const*, float const*, float const*, float) | 7.67 | 757344 | 35328.00 | 765664.00 | 21.50 | 0.95 | 98.78 | true | 0.207923;0.213055;0.215924;0.217269;0.216910 | 757344;757344;757344;757344;757344 | 35328;35328;35328;35328;35328 | 764576;759072;766112;767040;766304 | |
563 | densenet1_relu4_fwd | Activation | [1,2208,7,7] | 28 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 216384 | 0.00 | 7466.67 | 49.60 | 28.98 | 49.94 | false | 0.496615;0.517271;0.494973;0.494049;0.496145 | 216384;216384;216384;216384;216384 | 0;0;0;0;0 | 7808;8192;7040;7296;7296 | |
564 | densenet1_pool4_fwd | Pooling | [1,2208,7,7] | 159 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 1, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.00 | 154953 | 2816.00 | 10112.00 | 11.40 | 11.99 | 19.37 | true | 0.111014;0.114589;0.114077;0.113998;0.114197 | 154953;154953;154953;154953;154953 | 2560;3328;2816;2816;2816 | 10112;10112;9088;10240;10112 | |
566 | densenet1_dense0_fwd | FullyConnected | [1,2208] | 2464.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 4, 4, false, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>, float, float) | 22.00 | 4787000 | 8836928.00 | 2668352.00 | 9.80 | 0.42 | 217.59 | true | 0.098506;0.098056;0.098362;0.098163;0.098407 | 4787000;4787000;4787000;4787000;4787000 | 8836160;8836928;8836928;8837184;8836928 | 2669408;2669504;2654752;2678112;2666144 | |
566 | densenet1_dense0_fwd | FullyConnected | [1,2208] | 2464.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 3.33 | 1000 | 4992.00 | 85.33 | 12.30 | 0.20 | 0.30 | true | 0.122660;0.122647;0.123140;0.122716;0.122514 | 1000;1000;1000;1000;1000 | 0;384;256;0;0 | 5248;4992;4864;4992;4992 |
Showing 1 to 771 of 771 entries