GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_write_bytes | dram_read_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_write_bytes | dram_read_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | vgg4_conv0_fwd | Convolution | [1,3,224,224] | 38647 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 37.67 | 16056320 | 11155936.00 | 12693749.33 | 86.40 | 0.67 | 426.27 | true | 0.863154;0.863947;0.859803;0.866616;0.865145 | 16056320;16056320;16056320;16056320;16056320 | 11152352;11122656;11171168;11165280;11150176 | 12714496;12694432;12692832;12693984;12665216 | |
0 | vgg4_conv0_fwd | Convolution | [1,3,224,224] | 38647 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 31.67 | 211943424 | 8181.33 | 10622112.00 | 20.50 | 19.94 | 6692.88 | false | 0.204721;0.206162;0.204812;0.203000;0.204566 | 211943424;211943424;211943424;211943424;211943424 | 10144;7584;7584;9376;7328 | 10627616;10602592;10620704;10620960;10624672 | |
0 | vgg4_conv0_fwd | Convolution | [1,3,224,224] | 38647 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.33 | 0 | 96.00 | 0.00 | 21.80 | 0.00 | 0.00 | true | 0.217825;0.217914;0.218530;0.217638;0.218609 | 0;0;0;0;0 | 0;0;0;0;0 | 96;96;96;96;352 | |
1 | vgg4_relu0_fwd | Activation | [1,64,224,224] | 867.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 6422528 | 12845408.00 | 12840032.00 | 94.40 | 0.25 | 178.40 | true | 0.945219;0.944952;0.935417;0.943454;0.942278 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12845408;12845408 | 12840768;12839072;12840256;12838816;12844480 | |
2 | vgg4_conv1_fwd | Convolution | [1,64,224,224] | 259250 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 174.33 | 1742110720 | 20125429.33 | 13640597.33 | 24.00 | 51.59 | 9993.01 | false | 0.240624;0.239629;0.238519;0.239866;0.240415 | 1742110720;1742110720;1742110720;1742110720;1742110720 | 13707968;13619456;13713600;13565728;13594368 | 19425248;19991264;20285536;20099488;20289184 | |
2 | vgg4_conv1_fwd | Convolution | [1,64,224,224] | 259250 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 38.00 | 16056320 | 11809269.33 | 11593632.00 | 85.40 | 0.69 | 422.53 | true | 0.858601;0.851358;0.840421;0.865987;0.853222 | 16056320;16056320;16056320;16056320;16056320 | 11836384;11773216;11837088;11818208;11767392 | 11551136;11597280;11532480;11666368;11632480 | |
2 | vgg4_conv1_fwd | Convolution | [1,64,224,224] | 259250 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.67 | 237568 | 147648.00 | 368842.67 | 6.20 | 0.46 | 41.92 | true | 0.062398;0.062393;0.062393;0.062379;0.062396 | 237568;237568;237568;237568;237568 | 147648;147648;147648;147648;153024 | 368544;369440;358048;368544;379808 | |
3 | vgg4_relu1_fwd | Activation | [1,64,224,224] | 920.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 36.00 | 6422528 | 12845408.00 | 12840768.00 | 94.10 | 0.25 | 178.40 | true | 0.941711;0.942920;0.941824;0.940615;0.940779 | 6422528;6422528;6422528;6422528;6422528 | 12845408;12845408;12845408;12847200;12845408 | 12841152;12838848;12842304;12844864;12830144 | |
4 | vgg4_pool0_fwd | Pooling | [1,64,224,224] | 6218.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 26.00 | 802816 | 12715328.00 | 4473802.67 | 69.00 | 0.05 | 30.88 | true | 0.689647;0.690969;0.688160;0.685757;0.691203 | 802816;802816;802816;802816;802816 | 4458656;4545696;4418080;4468256;4494496 | 12709056;12735936;12700992;12688704;12736704 | |
5 | vgg4_conv2_fwd | Convolution | [1,64,112,112] | 111785.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 88.00 | 871055360 | 2490954.67 | 6468000.00 | 23.30 | 97.23 | 9898.36 | false | 0.233744;0.233205;0.233260;0.232438;0.235391 | 871055360;871055360;871055360;871055360;871055360 | 6399200;6314016;6519008;6502688;6502112 | 2536096;2474400;2516896;2481568;2433248 | |
5 | vgg4_conv2_fwd | Convolution | [1,64,112,112] | 111785.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 19.67 | 8028160 | 3861536.00 | 4749269.33 | 77.50 | 0.93 | 408.20 | true | 0.788343;0.777523;0.773097;0.774581;0.767495 | 8028160;8028160;8028160;8028160;8028160 | 3862048;3875296;3908832;3803488;3847264 | 4800640;4867360;4729568;4700320;4717600 | |
5 | vgg4_conv2_fwd | Convolution | [1,64,112,112] | 111785.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 475136 | 295104.00 | 230144.00 | 6.20 | 0.90 | 95.03 | true | 0.062346;0.062346;0.062347;0.062342;0.062336 | 475136;475136;475136;475136;475136 | 295104;295104;295104;295104;295104 | 231808;200704;247680;241920;216704 | |
6 | vgg4_relu2_fwd | Activation | [1,128,112,112] | 435.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 14.67 | 3211264 | 1267488.00 | 6296096.00 | 85.40 | 0.42 | 218.94 | true | 0.853032;0.850518;0.858362;0.853900;0.854389 | 3211264;3211264;3211264;3211264;3211264 | 1263648;1273760;1239968;1265056;1291040 | 6290656;6299520;6304000;6269952;6298112 | |
7 | vgg4_conv3_fwd | Convolution | [1,128,112,112] | 225271 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 159.00 | 1725251584 | 10850549.33 | 7605632.00 | 23.40 | 93.48 | 10850.64 | false | 0.236329;0.233352;0.231537;0.234110;0.234227 | 1725251584;1725251584;1725251584;1725251584;1725251584 | 11725088;10409888;11753440;10416672;10102880 | 7628672;7615360;7651072;7572864;7496192 | |
7 | vgg4_conv3_fwd | Convolution | [1,128,112,112] | 225271 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 20.00 | 8028160 | 4585312.00 | 4638560.00 | 79.50 | 0.87 | 401.41 | true | 0.790571;0.800014;0.803414;0.792925;0.786697 | 8028160;8028160;8028160;8028160;8028160 | 4582304;4684640;4546976;4518240;4626656 | 4621088;4630880;4557152;4663712;4707904 | |
7 | vgg4_conv3_fwd | Convolution | [1,128,112,112] | 225271 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 950272 | 590016.00 | 1118826.67 | 9.80 | 0.56 | 158.38 | true | 0.098731;0.098470;0.098322;0.098203;0.098542 | 950272;950272;950272;950272;950272 | 590016;590016;590016;590016;590016 | 1122176;1112800;1111296;1139008;1121504 | |
8 | vgg4_relu3_fwd | Activation | [1,128,112,112] | 433.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 14.33 | 3211264 | 1309216.00 | 6334997.33 | 85.90 | 0.42 | 224.05 | true | 0.853369;0.859163;0.862380;0.860131;0.858501 | 3211264;3211264;3211264;3211264;3211264 | 1313824;1305504;1256224;1328032;1308320 | 6340864;6310592;6339648;6348544;6324480 | |
9 | vgg4_pool1_fwd | Pooling | [1,128,112,112] | 3172.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 11.00 | 401408 | 1780714.67 | 2483594.67 | 58.20 | 0.09 | 36.49 | true | 0.581095;0.577383;0.585481;0.587245;0.579122 | 401408;401408;401408;401408;401408 | 1722048;1764800;1826624;1766208;1811136 | 2446496;2483136;2491872;2475776;2501696 | |
10 | vgg4_conv4_fwd | Convolution | [1,128,56,56] | 103749.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 97.33 | 985858048 | 592117.33 | 4578762.67 | 22.70 | 190.66 | 10128.71 | false | 0.229494;0.230762;0.220570;0.232456;0.221720 | 985858048;985858048;985858048;985858048;985858048 | 786528;592352;639200;544800;534752 | 4427680;4549312;4601152;4585824;4626560 | |
10 | vgg4_conv4_fwd | Convolution | [1,128,56,56] | 103749.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11.00 | 4014080 | 487840.00 | 863274.67 | 71.70 | 2.97 | 364.92 | true | 0.718085;0.719751;0.718776;0.715275;0.714489 | 4014080;4014080;4014080;4014080;4014080 | 407552;448352;502528;512640;523392 | 829312;835296;912256;952704;842272 | |
10 | vgg4_conv4_fwd | Convolution | [1,128,56,56] | 103749.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 7.00 | 1900544 | 1179840.00 | 1205568.00 | 17.10 | 0.80 | 271.51 | true | 0.171609;0.171074;0.170953;0.171644;0.171457 | 1900544;1900544;1900544;1900544;1900544 | 1179840;1179840;1179840;1179840;1179840 | 1233440;1209024;1191168;1210432;1197248 | |
11 | vgg4_relu4_fwd | Activation | [1,256,56,56] | 218.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.33 | 1605632 | 373.33 | 1630016.00 | 75.90 | 0.98 | 253.53 | true | 0.757616;0.758006;0.760075;0.761311;0.758089 | 1605632;1605632;1605632;1605632;1605632 | 544;288;2336;288;288 | 1641600;1650464;1613824;1571360;1634624 | |
12 | vgg4_conv5_fwd | Convolution | [1,256,56,56] | 209119.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 182.67 | 1962082304 | 10012064.00 | 6300714.67 | 21.60 | 120.28 | 10741.31 | false | 0.215919;0.214247;0.219216;0.214606;0.216702 | 1962082304;1962082304;1962082304;1962082304;1962082304 | 6171200;6092480;6385600;6345344;6404704 | 10076768;10057824;9968608;10009760;9962784 | |
12 | vgg4_conv5_fwd | Convolution | [1,256,56,56] | 209119.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11.67 | 4014080 | 2538730.67 | 242784.00 | 72.00 | 1.44 | 344.05 | true | 0.729168;0.717048;0.711729;0.715992;0.727039 | 4014080;4014080;4014080;4014080;4014080 | 2423072;2323616;2597056;2596064;2601088 | 290080;325408;213952;222432;215840 | |
12 | vgg4_conv5_fwd | Convolution | [1,256,56,56] | 209119.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 11.00 | 3801088 | 2359466.67 | 3511712.00 | 30.80 | 0.65 | 345.55 | true | 0.305121;0.307616;0.307389;0.312703;0.307559 | 3801088;3801088;3801088;3801088;3801088 | 3563712;3533856;3471872;3527584;3473696 | 2359680;2359552;2359360;2359488;2359296 | |
13 | vgg4_relu5_fwd | Activation | [1,256,56,56] | 224 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.00 | 1605632 | 288.00 | 1663776.00 | 76.00 | 0.96 | 229.38 | true | 0.761027;0.758831;0.760362;0.760372;0.757821 | 1605632;1605632;1605632;1605632;1605632 | 288;288;288;288;288 | 1685248;1669408;1637856;1650304;1671616 | |
14 | vgg4_conv6_fwd | Convolution | [1,256,56,56] | 208278.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 183.00 | 1962082304 | 9919882.67 | 6131146.67 | 21.40 | 122.24 | 10721.76 | false | 0.214553;0.215769;0.214064;0.214224;0.214413 | 1962082304;1962082304;1962082304;1962082304;1962082304 | 9902624;10045728;9958432;9898400;9898592 | 6154656;5864288;6334880;6425600;5903904 | |
14 | vgg4_conv6_fwd | Convolution | [1,256,56,56] | 208278.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 11.67 | 4014080 | 2280586.67 | 309173.33 | 71.70 | 1.55 | 344.05 | true | 0.712438;0.722498;0.716183;0.711468;0.726784 | 4014080;4014080;4014080;4014080;4014080 | 299904;409568;220320;225280;402336 | 2329408;2000960;2511392;2599904;1990368 | |
14 | vgg4_conv6_fwd | Convolution | [1,256,56,56] | 208278.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 10.33 | 3801088 | 2359317.33 | 3415189.33 | 30.20 | 0.66 | 367.86 | true | 0.304466;0.296475;0.306402;0.300784;0.299853 | 3801088;3801088;3801088;3801088;3801088 | 2359296;2359296;2359360;2359296;2359360 | 3409312;3439744;3432320;3397504;3403936 | |
15 | vgg4_relu6_fwd | Activation | [1,256,56,56] | 226.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.67 | 1605632 | 288.00 | 1652650.67 | 76.10 | 0.97 | 240.83 | true | 0.759201;0.764719;0.760336;0.760952;0.762407 | 1605632;1605632;1605632;1605632;1605632 | 288;288;288;288;288 | 1652160;1659104;1591392;1646688;1665792 | |
16 | vgg4_pool2_fwd | Pooling | [1,256,56,56] | 1648.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 6.33 | 200704 | 0.00 | 1059157.33 | 49.30 | 0.19 | 31.69 | true | 0.493636;0.492331;0.493123;0.492632;0.492570 | 200704;200704;200704;200704;200704 | 0;0;0;0;5376 | 1056992;1055936;1059616;1060864;1063232 | |
17 | vgg4_conv7_fwd | Convolution | [1,256,28,28] | 101245.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 121.67 | 1121189888 | 15350976.00 | 4050538.67 | 21.30 | 57.79 | 9215.23 | false | 0.212692;0.213124;0.212880;0.213029;0.212424 | 1121189888;1121189888;1121189888;1121189888;1121189888 | 14271168;14905760;15673760;16358208;15473408 | 4013088;3934752;4119872;4090624;4047904 | |
17 | vgg4_conv7_fwd | Convolution | [1,256,28,28] | 101245.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 19.00 | 7602176 | 4750336.00 | 7533312.00 | 46.90 | 0.62 | 400.11 | true | 0.467317;0.469733;0.470545;0.466369;0.468633 | 7602176;7602176;7602176;7602176;7602176 | 4764864;4748160;4747328;4751552;4751296 | 7558208;7644256;7492352;7504096;7537632 | |
17 | vgg4_conv7_fwd | Convolution | [1,256,28,28] | 101245.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.67 | 2007040 | 110858.67 | 85.33 | 70.80 | 18.09 | 261.78 | false | 0.707656;0.708697;0.709775;0.707616;0.706287 | 2007040;2007040;2007040;2007040;2007040 | 70816;115552;108672;113824;110080 | 0;224;128;128;0 | |
18 | vgg4_relu7_fwd | Activation | [1,512,28,28] | 112.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 802816 | 288.00 | 0.00 | 67.00 | 2787.56 | 141.67 | false | 0.669013;0.669107;0.670041;0.669435;0.672341 | 802816;802816;802816;802816;802816 | 0;0;0;0;0 | 288;288;288;288;288 | |
19 | vgg4_conv8_fwd | Convolution | [1,512,28,28] | 201649 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 234.00 | 2236874752 | 38858165.33 | 4068448.00 | 21.40 | 52.11 | 9559.29 | false | 0.214095;0.213252;0.213725;0.213252;0.214082 | 2236874752;2236874752;2236874752;2236874752;2236874752 | 38894560;37764416;39449632;39191552;38488384 | 4043808;4072096;4120960;3938624;4089440 | |
19 | vgg4_conv8_fwd | Convolution | [1,512,28,28] | 201649 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 35.00 | 15204352 | 9564672.00 | 14857557.33 | 58.40 | 0.62 | 434.41 | true | 0.584138;0.583374;0.583051;0.583408;0.584706 | 15204352;15204352;15204352;15204352;15204352 | 9575104;9567104;9563072;9563840;9561600 | 14850912;14856640;14804032;14962272;14865120 | |
19 | vgg4_conv8_fwd | Convolution | [1,512,28,28] | 201649 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 7.67 | 2007040 | 372576.00 | 7146.67 | 72.00 | 5.29 | 261.78 | true | 0.736419;0.717013;0.726759;0.717083;0.713930 | 2007040;2007040;2007040;2007040;2007040 | 377920;374976;364832;381376;364384 | 7360;7424;6656;8672;5920 | |
20 | vgg4_relu8_fwd | Activation | [1,512,28,28] | 121.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 6.00 | 802816 | 288.00 | 0.00 | 67.10 | 2787.56 | 133.80 | false | 0.669275;0.671234;0.670644;0.671178;0.670310 | 802816;802816;802816;802816;802816 | 288;288;288;288;288 | 0;0;0;0;0 | |
21 | vgg4_conv9_fwd | Convolution | [1,512,28,28] | 200680 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 234.00 | 2236874752 | 37284085.33 | 4026922.67 | 21.40 | 54.15 | 9559.29 | false | 0.214499;0.213255;0.212986;0.214240;0.213863 | 2236874752;2236874752;2236874752;2236874752;2236874752 | 36188544;35628192;38123328;37540384;38636224 | 4029440;4026688;4024640;4037120;3928576 | |
21 | vgg4_conv9_fwd | Convolution | [1,512,28,28] | 200680 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 35.00 | 15204352 | 9591189.33 | 14926240.00 | 57.90 | 0.62 | 434.41 | true | 0.580853;0.580659;0.578340;0.577963;0.577243 | 15204352;15204352;15204352;15204352;15204352 | 14940096;14911712;14888000;14926912;14987584 | 9580800;9591744;9598528;9592704;9589120 | |
21 | vgg4_conv9_fwd | Convolution | [1,512,28,28] | 200680 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 8.00 | 2007040 | 373205.33 | 7189.33 | 71.90 | 5.28 | 250.88 | true | 0.722224;0.724570;0.716545;0.718592;0.714362 | 2007040;2007040;2007040;2007040;2007040 | 6720;7360;7488;5920;7936 | 370848;372032;376736;364000;378624 | |
22 | vgg4_relu9_fwd | Activation | [1,512,28,28] | 120.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.67 | 802816 | 288.00 | 0.00 | 67.10 | 2787.56 | 141.67 | false | 0.671340;0.671462;0.670005;0.671126;0.671083 | 802816;802816;802816;802816;802816 | 288;288;5920;288;288 | 0;0;0;0;0 | |
23 | vgg4_pool3_fwd | Pooling | [1,512,28,28] | 823.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 5.67 | 100352 | 0.00 | 0.00 | 31.10 | 0.00 | 17.71 | true | 0.310643;0.310498;0.310681;0.310567;0.310804 | 100352;100352;100352;100352;100352 | 0;1536;0;0;0 | 0;0;0;0;0 | |
24 | vgg4_conv10_fwd | Convolution | [1,512,14,14] | 51944.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 123.00 | 559218688 | 17178912.00 | 3452768.00 | 12.50 | 27.10 | 4546.49 | false | 0.124958;0.124960;0.124958;0.124959;0.124960 | 559218688;559218688;559218688;559218688;559218688 | 3457248;3441120;3519584;3425792;3459936 | 17178912;17178912;17178912;17178912;17178912 | |
24 | vgg4_conv10_fwd | Convolution | [1,512,14,14] | 51944.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 35.00 | 15204352 | 9574912.00 | 15420938.67 | 58.40 | 0.61 | 434.41 | true | 0.583305;0.584540;0.584390;0.587153;0.584268 | 15204352;15204352;15204352;15204352;15204352 | 9567552;9565056;9577600;9579584;9580352 | 15416288;15433024;15353664;15445824;15413504 | |
24 | vgg4_conv10_fwd | Convolution | [1,512,14,14] | 51944.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 501760 | 2336.00 | 0.00 | 55.20 | 214.79 | 100.35 | false | 0.546698;0.552017;0.559505;0.552177;0.551904 | 501760;501760;501760;501760;501760 | 2336;2336;2336;2336;2336 | 0;0;0;0;0 | |
25 | vgg4_relu10_fwd | Activation | [1,512,14,14] | 36.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 200704 | 970.67 | 0.00 | 50.00 | 206.77 | 43.00 | false | 0.498703;0.500960;0.499774;0.500233;0.497598 | 200704;200704;200704;200704;200704 | 0;0;0;0;0 | 288;2336;2336;288;288 | |
26 | vgg4_conv11_fwd | Convolution | [1,512,14,14] | 51948.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 123.00 | 559218688 | 17178912.00 | 3510880.00 | 12.50 | 27.03 | 4546.49 | false | 0.124958;0.124957;0.124958;0.124957;0.124960 | 559218688;559218688;559218688;559218688;559218688 | 17178912;17178912;17178912;17178912;17178912 | 3559392;3500768;3522784;3499616;3509088 | |
26 | vgg4_conv11_fwd | Convolution | [1,512,14,14] | 51948.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 34.67 | 15204352 | 9561344.00 | 13670624.00 | 57.90 | 0.65 | 438.58 | true | 0.578526;0.582256;0.577001;0.577842;0.580410 | 15204352;15204352;15204352;15204352;15204352 | 9583424;9557952;9549888;9566016;9560064 | 13622112;13680736;13658720;13681632;13672416 | |
26 | vgg4_conv11_fwd | Convolution | [1,512,14,14] | 51948.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 501760 | 2336.00 | 0.00 | 55.20 | 214.79 | 100.35 | false | 0.544219;0.554579;0.554468;0.546594;0.555730 | 501760;501760;501760;501760;501760 | 2336;2336;2336;2336;2336 | 0;0;0;0;0 | |
27 | vgg4_relu11_fwd | Activation | [1,512,14,14] | 34.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.33 | 200704 | 288.00 | 0.00 | 50.10 | 696.89 | 46.32 | false | 0.498020;0.536563;0.501258;0.501724;0.501175 | 200704;200704;200704;200704;200704 | 288;288;288;288;288 | 0;0;0;0;0 | |
28 | vgg4_conv12_fwd | Convolution | [1,512,14,14] | 51978.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 123.00 | 559218688 | 17178997.33 | 3350837.33 | 12.50 | 27.24 | 4546.49 | false | 0.124958;0.124959;0.124959;0.124958;0.124957 | 559218688;559218688;559218688;559218688;559218688 | 17179168;17178912;17185568;17178912;17178912 | 3349088;3396832;3343712;3359328;3344096 | |
28 | vgg4_conv12_fwd | Convolution | [1,512,14,14] | 51978.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 34.00 | 15204352 | 9587733.33 | 13823690.67 | 57.60 | 0.65 | 447.19 | true | 0.577299;0.575082;0.572726;0.578302;0.574225 | 15204352;15204352;15204352;15204352;15204352 | 9585664;9570752;9588480;9589056;9603712 | 13818560;13784544;13837888;13814880;13837632 | |
28 | vgg4_conv12_fwd | Convolution | [1,512,14,14] | 51978.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)0, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 0>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 5.00 | 501760 | 2336.00 | 0.00 | 56.10 | 214.79 | 100.35 | false | 0.560633;0.553934;0.568126;0.560924;0.560034 | 501760;501760;501760;501760;501760 | 0;0;0;0;0 | 2336;2336;2336;2336;2336 | |
29 | vgg4_relu12_fwd | Activation | [1,512,14,14] | 34.667 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.67 | 200704 | 288.00 | 0.00 | 50.00 | 696.89 | 43.00 | false | 0.499879;0.500561;0.498805;0.500375;0.501766 | 200704;200704;200704;200704;200704 | 288;288;288;288;288 | 0;0;0;0;0 | |
30 | vgg4_pool4_fwd | Pooling | [1,512,14,14] | 259.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 4.67 | 25088 | 0.00 | 0.00 | 10.80 | 0.00 | 5.38 | true | 0.108269;0.108240;0.108287;0.108287;0.108286 | 25088;25088;25088;25088;25088 | 0;0;0;0;0 | 0;0;0;0;0 | |
31 | vgg4_dense0_fwd | FullyConnected | [1,512,7,7] | 176781 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void gemv2T_kernel_val<long, long, float, float, float, 128, 16, 2, 2, false, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>, float, float) | 491.00 | 224849920 | 235613888.00 | 293685.33 | 39.90 | 0.95 | 457.94 | true | 0.399143;0.398866;0.399075;0.398893;0.399347 | 224849920;224849920;224849920;224849920;224849920 | 320576;296576;272448;287872;296608 | 263408384;237754496;224885120;231349696;237737472 | |
31 | vgg4_dense0_fwd | FullyConnected | [1,512,7,7] | 176781 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 4.00 | 4096 | 19498.67 | 0.00 | 12.20 | 0.21 | 1.02 | true | 0.122656;0.122258;0.122286;0.122369;0.122394 | 4096;4096;4096;4096;4096 | 19904;20672;19008;18976;19584 | 0;0;0;0;0 | |
32 | vgg4_dense0_relu_fwd | Activation | [1,4096] | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 8192 | 288.00 | 0.00 | 31.10 | 28.44 | 2.05 | false | 0.308098;0.313916;0.311853;0.296024;0.314685 | 8192;8192;8192;8192;8192 | 288;288;288;288;288 | 0;0;0;0;0 | |
33 | vgg4_dropout0_fwd | Dropout | [1,4096] | 9.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::identity, 1>, float*, float*>(int, float*, float*) | 3.00 | 0 | 0.00 | 0.00 | 12.30 | 0.00 | 0.00 | true | 0.122511;0.122495;0.122509;0.122529;0.122526 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 | |
34 | vgg4_dense1_fwd | FullyConnected | [1,4096] | 27275 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void gemv2T_kernel_val<long, long, float, float, float, 128, 16, 2, 2, false, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>, float, float) | 81.00 | 36761600 | 67122240.00 | 18858.67 | 39.50 | 0.55 | 453.85 | true | 0.395235;0.394583;0.394969;0.395020;0.395119 | 36761600;36761600;36761600;36761600;36761600 | 18752;19136;18752;18784;19040 | 67122240;67122240;67122752;67122240;67122240 | |
34 | vgg4_dense1_fwd | FullyConnected | [1,4096] | 27275 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 4.00 | 4096 | 16576.00 | 0.00 | 12.20 | 0.25 | 1.02 | true | 0.122159;0.122194;0.122192;0.122138;0.122087 | 4096;4096;4096;4096;4096 | 16576;16576;16576;18368;16576 | 0;0;0;0;0 | |
35 | vgg4_dense1_relu_fwd | Activation | [1,4096] | 14.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void op_generic_tensor_kernel<2, float, float, float, 256, (cudnnGenericOp_t)8, (cudnnNanPropagation_t)0, (cudnnDimOrder_t)0, 1>(cudnnTensorStruct, float*, cudnnTensorStruct, float const*, cudnnTensorStruct, float const*, float, float, float, float, dimArray, reducedDivisorArray, bool) | 4.00 | 8192 | 288.00 | 0.00 | 30.80 | 28.44 | 2.05 | false | 0.302199;0.311627;0.299349;0.312367;0.311425 | 8192;8192;8192;8192;8192 | 288;288;288;288;288 | 0;0;0;0;0 | |
36 | vgg4_dropout1_fwd | Dropout | [1,4096] | 8.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mxnet::op::mxnet_op::mxnet_generic_kernel<mxnet::op::mxnet_op::op_with_req<mxnet::op::mshadow_op::identity, 1>, float*, float*>(int, float*, float*) | 3.00 | 0 | 0.00 | 0.00 | 12.20 | 0.00 | 0.00 | true | 0.122341;0.122279;0.122367;0.122357;0.122350 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 | |
37 | vgg4_dense2_fwd | FullyConnected | [1,4096] | 6843.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void gemv2T_kernel_val<int, int, float, float, float, 128, 16, 4, 4, false, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>, float, float) | 25.00 | 8893000 | 16519978.67 | 18709.33 | 28.20 | 0.54 | 355.72 | true | 0.281047;0.282381;0.281603;0.281506;0.281474 | 8893000;8893000;8893000;8893000;8893000 | 18496;18496;18752;19264;18880 | 16521088;16520832;16520128;16518976;16518720 | |
37 | vgg4_dense2_fwd | FullyConnected | [1,4096] | 6843.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void splitKreduce_kernel<float, float, float>(cublasSplitKParams<float>, float const*, float const*, float*, float const*, float const*) | 4.00 | 4000 | 160.00 | 0.00 | 6.20 | 25.00 | 1.00 | false | 0.062297;0.062299;0.062293;0.062308;0.062302 | 4000;4000;4000;4000;4000 | 0;0;0;0;0 | 160;160;160;160;160 | |
37 | vgg4_dense2_fwd | FullyConnected | [1,4096] | 6843.333 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | void mshadow::cuda::MapPlanKernel<mshadow::sv::plusto, 8, mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float> >(mshadow::expr::Plan<mshadow::Tensor<mshadow::gpu, 2, float>, float>, int, mshadow::Shape<2>, mshadow::expr::Plan<mshadow::expr::Broadcast1DExp<mshadow::Tensor<mshadow::gpu, 1, float>, float, 2, 1>, float>) | 3.00 | 1000 | 4224.00 | 0.00 | 12.20 | 0.24 | 0.33 | true | 0.122165;0.122099;0.122186;0.122252;0.122258 | 1000;1000;1000;1000;1000 | 0;0;0;0;0 | 4224;4224;4224;4224;4224 |
Showing 1 to 68 of 68 entries