GPU Kernel Information

layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | InceptionV4/InceptionV4/Conv2d_1a_3x3/BatchNorm/batchnorm/mul-0-TransposeNHWCToNCHW-LayoutOptimizer | Transpose | [[1 3 299 299]] | 84.333 | 1072896 | 1072896 | 172752384 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 8.00 | 0 | 7008.00 | 65696.00 | 84.10 | 0.00 | 0.00 | true | 0.842810;0.844193;0.844762;0.835041;0.831059 | 0;0;0;0;0 | 7008;7008;6944;7008;7072 | 65984;60192;64192;66912;67648 |
2 | InceptionV4/InceptionV4/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 149 149]] | 208.333 | 2841856 | 2845440 | 174521344 | GPU_0_bfc | 3584 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 19.00 | 46192416 | 1222474.67 | 3397269.33 | 18.90 | 10.00 | 2431.18 | true | 0.188339;0.188888;0.188332;0.189537;0.189820 | 46192416;46192416;46192416;46192416;46192416 | 1207328;1212000;1188544;1248096;1249856 | 3385952;3378112;3352096;3438080;3427744 |
2 | InceptionV4/InceptionV4/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 149 149]] | 208.333 | 2841856 | 2845440 | 174521344 | GPU_0_bfc | 3584 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 3.00 | 0 | 3712.00 | 0.00 | 37.60 | 0.00 | 0.00 | true | 0.376751;0.376176;0.376319;0.376201;0.376176 | 0;0;0;0;0 | 3712;3712;3712;3712;3712 | 0;0;0;0;0 |
3 | InceptionV4/InceptionV4/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 32 149 149]] | 41 | 2841856 | 0 | 173448448 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.00 | 710432 | 640.00 | 344192.00 | 78.90 | 2.06 | 118.41 | true | 0.783121;0.786196;0.789072;0.791481;0.790943 | 710432;710432;710432;710432;710432 | 640;640;640;640;640 | 348480;351744;341408;337120;342688 |
4 | InceptionV4/InceptionV4/Conv2d_1a_3x3/Relu | Relu | [[1 32 149 149]] | 24.667 | 2841856 | 0 | 173448448 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 768.00 | 200320.00 | 68.70 | 0.00 | 0.00 | true | 0.687750;0.685543;0.684034;0.688122;0.687145 | 0;0;0;0;0 | 768;768;768;768;768 | 199200;199936;205088;199680;201344 |
5 | InceptionV4/InceptionV4/Conv2d_2a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 147 147]] | 169.333 | 2766080 | 2905600 | 176214528 | GPU_0_bfc | 139520 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 38.33 | 215183360 | 572629.33 | 3489696.00 | 22.70 | 52.97 | 5613.53 | false | 0.224587;0.227328;0.228719;0.227031;0.226382 | 215183360;215183360;215183360;215183360;215183360 | 377952;705184;620800;534432;562656 | 3279264;3658624;3529216;3448096;3491776 |
5 | InceptionV4/InceptionV4/Conv2d_2a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 147 147]] | 169.333 | 2766080 | 2905600 | 176214528 | GPU_0_bfc | 139520 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 59392 | 6144.00 | 41578.67 | 6.20 | 1.24 | 9.90 | true | 0.062432;0.062428;0.062431;0.062432;0.062434 | 59392;59392;59392;59392;59392 | 7680;6144;6144;6144;6144 | 41280;41024;41664;41792;42432 |
5 | InceptionV4/InceptionV4/Conv2d_2a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 147 147]] | 169.333 | 2766080 | 2905600 | 176214528 | GPU_0_bfc | 139520 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 36864.00 | 725.33 | 42.20 | 0.00 | 0.00 | true | 0.423293;0.421596;0.420777;0.422114;0.421787 | 0;0;0;0;0 | 38400;36864;36864;36864;36864 | 1920;768;640;768;640 |
6 | InceptionV4/InceptionV4/Conv2d_2a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 32 147 147]] | 27.667 | 2766080 | 0 | 173372672 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.00 | 691488 | 277.33 | 146389.33 | 78.10 | 4.71 | 115.25 | true | 0.785012;0.779648;0.781788;0.780113;0.777550 | 691488;691488;691488;691488;691488 | 256;320;256;384;256 | 153600;141920;117824;143648;158240 |
7 | InceptionV4/InceptionV4/Conv2d_2a_3x3/Relu | Relu | [[1 32 147 147]] | 22 | 2766080 | 0 | 173372672 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 0.00 | 189386.67 | 67.60 | 0.00 | 0.00 | true | 0.677664;0.674303;0.675467;0.674519;0.676841 | 0;0;0;0;0 | 0;0;256;0;0 | 188224;201344;198592;180960;181344 |
8 | InceptionV4/InceptionV4/Conv2d_2b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 147 147]] | 147 | 5531904 | 5810688 | 178904576 | GPU_0_bfc | 278784 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 50.67 | 430366720 | 2113450.67 | 7659210.67 | 23.00 | 44.04 | 8494.02 | false | 0.227016;0.229228;0.230018;0.232435;0.229387 | 430366720;430366720;430366720;430366720;430366720 | 2153856;2079872;2303040;2003200;2106624 | 7723968;7590112;7766016;7607840;7645824 |
8 | InceptionV4/InceptionV4/Conv2d_2b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 147 147]] | 147 | 5531904 | 5810688 | 178904576 | GPU_0_bfc | 278784 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 73728.00 | 26325.33 | 42.30 | 0.00 | 0.00 | true | 0.424938;0.422119;0.424590;0.422849;0.422740 | 0;0;0;0;0 | 73728;73728;73728;73728;73728 | 26496;26624;25856;24320;30592 |
8 | InceptionV4/InceptionV4/Conv2d_2b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 147 147]] | 147 | 5531904 | 5810688 | 178904576 | GPU_0_bfc | 278784 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 118784 | 0.00 | 73941.33 | 6.20 | 1.61 | 39.59 | true | 0.062320;0.062309;0.062306;0.062309;0.062310 | 118784;118784;118784;118784;118784 | 0;0;0;0;0 | 74112;73600;73984;74112;73728 |
9 | InceptionV4/InceptionV4/Conv2d_2b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 64 147 147]] | 31.333 | 5531904 | 0 | 176138496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 10.33 | 1382976 | 1617472.00 | 3343061.33 | 86.20 | 0.28 | 133.84 | true | 0.860099;0.842894;0.872895;0.867984;0.859230 | 1382976;1382976;1382976;1382976;1382976 | 3350464;3424512;3348160;3330560;3296448 | 1707552;1614144;1610656;1546400;1627616 |
10 | InceptionV4/InceptionV4/Conv2d_2b_3x3/Relu | Relu | [[1 64 147 147]] | 24.333 | 5531904 | 0 | 176138496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 7.00 | 0 | 96.00 | 4955552.00 | 75.40 | 0.00 | 0.00 | true | 0.762738;0.754482;0.754980;0.752859;0.753478 | 0;0;0;0;0 | 96;96;352;96;96 | 4842976;4690048;5021536;5002144;5029920 |
11 | InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 73 73]] | 179.333 | 2046464 | 2267648 | 178184960 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 96.33 | 591515232 | 411040.00 | 2624981.33 | 15.40 | 194.83 | 6140.32 | false | 0.158096;0.151956;0.150305;0.155705;0.153529 | 591515232;591515232;591515232;591515232;591515232 | 302560;447904;353504;431712;490976 | 2616736;2649120;2583840;2609088;2666144 |
11 | InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 73 73]] | 179.333 | 2046464 | 2267648 | 178184960 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 221184.00 | 485898.67 | 45.10 | 0.00 | 0.00 | true | 0.450624;0.451920;0.452720;0.449651;0.449816 | 0;0;0;0;0 | 222976;221184;221184;221184;221184 | 534720;512416;468000;475520;469760 |
12 | InceptionV4/InceptionV4/Mixed_3a/Branch_0/MaxPool_0a_3x3/MaxPool | MaxPool | [[1 64 73 73]] | 63.667 | 1364224 | 1364224 | 179549184 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 18.00 | 341056 | 4553280.00 | 2985514.67 | 66.50 | 0.05 | 18.95 | true | 0.669430;0.663422;0.666603;0.659273;0.664355 | 341056;341056;341056;341056;341056 | 3068352;3013792;2987008;2951200;2955744 | 4600512;4614368;4488800;4494240;4565088 |
13 | InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 73 73]] | 27.667 | 2046464 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.33 | 511584 | 1284074.67 | 733749.33 | 72.30 | 0.25 | 80.78 | true | 0.721462;0.725955;0.722186;0.723595;0.722165 | 511584;511584;511584;511584;511584 | 722080;725312;744032;743392;732544 | 1302368;1268736;1296640;1285824;1269760 |
14 | InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/Relu | Relu | [[1 96 73 73]] | 19.333 | 2046464 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 96.00 | 346602.67 | 68.70 | 0.00 | 0.00 | true | 0.710312;0.675968;0.675070;0.715521;0.674939 | 0;0;0;0;0 | 96;96;96;96;96 | 351072;343456;345856;349696;344256 |
16 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 116 | 1364224 | 1405184 | 175381504 | GPU_0_bfc | 40960 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 27.00 | 109786176 | 208885.33 | 1681568.00 | 11.30 | 58.07 | 4066.15 | false | 0.113872;0.112676;0.113516;0.112665;0.113240 | 109786176;109786176;109786176;109786176;109786176 | 198560;207680;209440;209536;217856 | 1694912;1675936;1682496;1686272;1640576 |
16 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 116 | 1364224 | 1405184 | 175381504 | GPU_0_bfc | 40960 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 41045.33 | 84042.67 | 41.90 | 0.00 | 0.00 | true | 0.418217;0.418711;0.419315;0.416621;0.419581 | 0;0;0;0;0 | 41216;41216;40960;40960;40960 | 81920;83264;83584;85280;87936 |
17 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 111.333 | 2046464 | 2087424 | 177427968 | GPU_0_bfc | 40960 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 27.00 | 109786176 | 30336.00 | 1461258.67 | 11.00 | 73.60 | 4066.15 | false | 0.110428;0.110672;0.109792;0.109167;0.111553 | 109786176;109786176;109786176;109786176;109786176 | 34464;37408;27264;26176;29280 | 1472160;1462720;1447296;1456960;1464096 |
17 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 111.333 | 2046464 | 2087424 | 177427968 | GPU_0_bfc | 40960 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 41045.33 | 3125.33 | 41.30 | 0.00 | 0.00 | true | 0.411627;0.418260;0.412741;0.413826;0.413179 | 0;0;0;0;0 | 40960;40960;40960;41216;44032 | 4768;2592;2848;3616;2912 |
18 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 73 73]] | 26.667 | 1364224 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 341056 | 186325.33 | 867637.33 | 58.20 | 0.32 | 68.21 | true | 0.616829;0.569620;0.571005;0.603990;0.566997 | 341056;341056;341056;341056;341056 | 198400;178688;179584;186624;192768 | 874496;866112;869472;866304;867136 |
19 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 73 73]] | 20.667 | 2046464 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 341056 | 256.00 | 618698.67 | 52.90 | 0.55 | 68.21 | true | 0.529622;0.528344;0.529410;0.530475;0.528662 | 341056;341056;341056;341056;341056 | 556064;601152;633280;637984;621664 | 256;256;256;256;256 |
20 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 73 73]] | 19.667 | 1364224 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 132106.67 | 60.50 | 0.00 | 0.00 | true | 0.605878;0.606022;0.605539;0.604995;0.603214 | 0;0;0;0;0 | 111392;125248;131328;146368;139744 | 0;0;0;0;0 |
21 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 64 73 73]] | 18 | 2046464 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 48384.00 | 60.20 | 0.00 | 0.00 | true | 0.600472;0.600454;0.603491;0.603538;0.602106 | 0;0;0;0;0 | 0;0;0;1536;0 | 55872;47584;50432;44320;47136 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 12.67 | 41648640 | 12878.22 | 669400.89 | 4.40 | 61.04 | 3287.96 | false | 0.043051;0.043186;0.043192;0.044870;0.044984;0.044979;0.047522;0.047719;0.047718;0.043634;0.043722;0.043732;0.043636;0.043798;0.043799 | 41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640 | 18528;14656;1888;18528;14560;1824;18528;15328;1792;18528;14816;3136;18528;14176;2176 | 790528;533088;674432;792640;529792;673888;790304;540640;681504;791552;525760;663296;790560;530752;676928 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.56 | 1663232 | 20828.44 | 609671.11 | 2.80 | 2.64 | 132.47 | true | 0.028403;0.028434;0.028392;0.028399;0.028421;0.028362;0.028405;0.028424;0.028361;0.028407;0.028411;0.028375;0.028402;0.028396;0.028379 | 1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232 | 19104;31936;13824;19104;32736;13440;19104;32544;13248;19104;32320;13632;19104;32256;13920 | 620832;841824;331904;630176;843168;324064;630176;842464;321952;629280;848480;329984;630400;844352;327328 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.33 | 1663232 | 20828.44 | 609671.11 | 2.80 | 2.64 | 134.86 | true | 0.028403;0.028434;0.028392;0.028399;0.028421;0.028362;0.028405;0.028424;0.028361;0.028407;0.028411;0.028375;0.028402;0.028396;0.028379 | 1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232 | 19104;31936;13824;19104;32736;13440;19104;32544;13248;19104;32320;13632;19104;32256;13920 | 620832;841824;331904;630176;843168;324064;630176;842464;321952;629280;848480;329984;630400;844352;327328 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.11 | 1663232 | 20828.44 | 609671.11 | 2.80 | 2.64 | 137.33 | true | 0.028403;0.028434;0.028392;0.028399;0.028421;0.028362;0.028405;0.028424;0.028361;0.028407;0.028411;0.028375;0.028402;0.028396;0.028379 | 1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232 | 19104;31936;13824;19104;32736;13440;19104;32544;13248;19104;32320;13632;19104;32256;13920 | 620832;841824;331904;630176;843168;324064;630176;842464;321952;629280;848480;329984;630400;844352;327328 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 1458176 | 14165.33 | 281269.33 | 2.50 | 4.94 | 121.51 | true | 0.024923;0.024923;0.024925;0.024923;0.024922 | 1458176;1458176;1458176;1458176;1458176 | 15360;13824;14080;14336;14080 | 285920;281824;280416;281568;280160 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 11.78 | 41648640 | 12878.22 | 669400.89 | 4.40 | 61.04 | 3536.14 | false | 0.043051;0.043186;0.043192;0.044870;0.044984;0.044979;0.047522;0.047719;0.047718;0.043634;0.043722;0.043732;0.043636;0.043798;0.043799 | 41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640 | 18528;14656;1888;18528;14560;1824;18528;15328;1792;18528;14816;3136;18528;14176;2176 | 790528;533088;674432;792640;529792;673888;790304;540640;681504;791552;525760;663296;790560;530752;676928 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 11.78 | 41648640 | 12878.22 | 669400.89 | 4.40 | 61.04 | 3536.14 | false | 0.043051;0.043186;0.043192;0.044870;0.044984;0.044979;0.047522;0.047719;0.047718;0.043634;0.043722;0.043732;0.043636;0.043798;0.043799 | 41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640 | 790528;533088;674432;792640;529792;673888;790304;540640;681504;791552;525760;663296;790560;530752;676928 | 18528;14656;1888;18528;14560;1824;18528;15328;1792;18528;14816;3136;18528;14176;2176 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.11 | 1617550.222 | 9280.00 | 559800.89 | 2.80 | 2.84 | 264.69 | true | 0.028340;0.028118;0.028113;0.028325;0.028209;0.028116;0.028340;0.028145;0.028177;0.028342;0.028192;0.028119;0.028336;0.028078;0.028116 | 1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480 | 19232;6208;1024;19584;8000;1056;19712;8544;896;18624;9664;1760;19328;6656;4832 | 889312;496640;384480;889632;500608;383968;886816;497856;387584;888896;496928;389536;887424;494816;386368 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.56 | 1617550.222 | 9280.00 | 559800.89 | 2.80 | 2.84 | 291.14 | true | 0.028340;0.028118;0.028113;0.028325;0.028209;0.028116;0.028340;0.028145;0.028177;0.028342;0.028192;0.028119;0.028336;0.028078;0.028116 | 1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480 | 19232;6208;1024;19584;8000;1056;19712;8544;896;18624;9664;1760;19328;6656;4832 | 889312;496640;384480;889632;500608;383968;886816;497856;387584;888896;496928;389536;887424;494816;386368 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.44 | 1617550.222 | 9280.00 | 559800.89 | 2.80 | 2.84 | 297.13 | true | 0.028340;0.028118;0.028113;0.028325;0.028209;0.028116;0.028340;0.028145;0.028177;0.028342;0.028192;0.028119;0.028336;0.028078;0.028116 | 1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480 | 19232;6208;1024;19584;8000;1056;19712;8544;896;18624;9664;1760;19328;6656;4832 | 889312;496640;384480;889632;500608;383968;886816;497856;387584;888896;496928;389536;887424;494816;386368 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 306 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 114688.00 | 981.33 | 43.40 | 0.00 | 0.00 | true | 0.436088;0.429951;0.437425;0.433042;0.432344 | 0;0;0;0;0 | 114688;114688;114688;114688;114688 | 512;768;1152;2176;1024 |
23 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 140.667 | 1935872 | 3300096 | 175953152 | GPU_0_bfc | 1364224 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 38.00 | 299980800 | 1331680.00 | 1728789.33 | 19.40 | 98.02 | 7894.23 | false | 0.195031;0.201366;0.193068;0.190267;0.193534 | 299980800;299980800;299980800;299980800;299980800 | 1733984;1724864;1720096;1746560;1727520 | 1325664;1336352;1329696;1329536;1335808 |
23 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 140.667 | 1935872 | 3300096 | 175953152 | GPU_0_bfc | 1364224 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221525.33 | 108042.67 | 45.00 | 0.00 | 0.00 | true | 0.450713;0.447846;0.450730;0.449822;0.450773 | 0;0;0;0;0 | 221632;221440;221504;221440;221632 | 99360;111392;105984;114368;106752 |
23 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 140.667 | 1935872 | 3300096 | 175953152 | GPU_0_bfc | 1364224 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.67 | 356352 | 3477.33 | 232469.33 | 6.20 | 1.51 | 76.36 | true | 0.062400;0.062405;0.062413;0.062405;0.062413 | 356352;356352;356352;356352;356352 | 3072;3264;3584;3648;3584 | 244928;233056;230944;230336;233408 |
24 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 64 73 73]] | 26 | 1364224 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 341056 | 116917.33 | 798698.67 | 62.00 | 0.37 | 68.21 | true | 0.621157;0.619390;0.617213;0.618023;0.622160 | 341056;341056;341056;341056;341056 | 117696;115616;119456;116576;116480 | 798048;792320;806496;793184;804864 |
25 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 71 71]] | 21 | 1935872 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 483936 | 384.00 | 779413.33 | 68.60 | 0.62 | 103.69 | true | 0.687609;0.685192;0.688790;0.683189;0.686634 | 483936;483936;483936;483936;483936 | 384;1920;384;384;384 | 787232;805216;690656;802272;748736 |
26 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 64 73 73]] | 19 | 1364224 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 1024.00 | 558560.00 | 60.50 | 0.00 | 0.00 | true | 0.603732;0.604611;0.604839;0.606307;0.606014 | 0;0;0;0;0 | 1024;1024;1024;1024;1024 | 566336;517888;626592;534720;574624 |
27 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 166.333 | 1364224 | 1510912 | 175270912 | GPU_0_bfc | 146688 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 64.67 | 308969472 | 24522.67 | 1487296.00 | 6.20 | 204.37 | 4777.85 | false | 0.062493;0.062493;0.062493;0.062493;0.062493 | 308969472;308969472;308969472;308969472;308969472 | 28480;24192;24128;24160;25216 | 1502144;1490848;1487616;1483424;1477856 |
27 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 166.333 | 1364224 | 1510912 | 175270912 | GPU_0_bfc | 146688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 114688.00 | 70389.33 | 43.40 | 0.00 | 0.00 | true | 0.434836;0.432978;0.435868;0.433238;0.432050 | 0;0;0;0;0 | 114688;114688;114688;114688;114688 | 66688;72960;70688;67520;75680 |
27 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 166.333 | 1364224 | 1510912 | 175270912 | GPU_0_bfc | 146688 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 2560.00 | 16938.67 | 6.20 | 0.00 | 0.00 | true | 0.061868;0.061753;0.061778;0.061736;0.061776 | 0;0;0;0;0 | 2560;2560;2560;2560;2560 | 14592;16000;26624;16000;18816 |
28 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 73 73]] | 25 | 1364224 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 341056 | 512.00 | 36565.33 | 54.00 | 9.20 | 68.21 | true | 0.539149;0.541542;0.539012;0.537920;0.540363 | 341056;341056;341056;341056;341056 | 512;2048;512;512;512 | 35584;35712;37376;38144;36608 |
29 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/Relu | Relu | [[1 64 73 73]] | 20 | 1364224 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 298.67 | 60.60 | 0.00 | 0.00 | true | 0.605279;0.606887;0.605565;0.605979;0.605974 | 0;0;0;0;0 | 0;0;0;0;0 | 256;384;256;384;256 |
30 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 129.333 | 3410688 | 4246784 | 177317376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 36.00 | 299980800 | 306122.67 | 2558880.00 | 19.50 | 104.71 | 8332.80 | false | 0.196667;0.197841;0.193774;0.191342;0.195215 | 299980800;299980800;299980800;299980800;299980800 | 299136;323936;286272;301952;317280 | 2562720;2594400;2536448;2544672;2569248 |
30 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 129.333 | 3410688 | 4246784 | 177317376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 221184.00 | 37653.33 | 44.80 | 0.00 | 0.00 | true | 0.446842;0.449785;0.443231;0.450951;0.448391 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 35072;29056;33728;45184;44160 |
30 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 129.333 | 3410688 | 4246784 | 177317376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 356352 | 0.00 | 250229.33 | 6.20 | 1.42 | 118.78 | true | 0.062286;0.062296;0.062299;0.062299;0.062292 | 356352;356352;356352;356352;356352 | 0;0;0;2304;0 | 250848;256032;252128;247712;241888 |
31 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 71 71]] | 25.667 | 3410688 | 0 | 175953152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 483936 | 384.00 | 31104.00 | 68.50 | 15.37 | 96.79 | true | 0.682978;0.684735;0.683431;0.688707;0.688119 | 483936;483936;483936;483936;483936 | 384;384;384;384;384 | 36128;48320;21760;32928;24256 |
33 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 192 71 71]] | 24 | 3871488 | 0 | 174478080 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 6.00 | 0 | 422869.33 | 2404128.00 | 77.00 | 0.00 | 0.00 | true | 0.762593;0.769120;0.767336;0.774058;0.773377 | 0;0;0;0;0 | 2411424;2399296;2401664;2371168;2424288 | 417152;419072;429440;420096;434432 |
34 | InceptionV4/InceptionV4/Mixed_5a/Branch_1/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 192 35 35]] | 41.333 | 940800 | 940800 | 175418880 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 10.00 | 235200 | 8032.00 | 1285290.67 | 59.50 | 0.18 | 23.52 | true | 0.590348;0.595042;0.595037;0.595154;0.596583 | 235200;235200;235200;235200;235200 | 7776;8128;8192;7808;8160 | 1287616;1268640;1284544;1295456;1283712 |
35 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 309.667 | 940800 | 2275328 | 176359680 | GPU_0_bfc | 1334528 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 203.00 | 849838080 | 3485248.00 | 2136554.67 | 6.20 | 151.17 | 4186.39 | false | 0.062498;0.062498;0.062498;0.062498;0.062498 | 849838080;849838080;849838080;849838080;849838080 | 3464416;3501536;3513600;3448864;3489792 | 2135552;2128960;2138816;2135296;2152128 |
35 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 309.667 | 940800 | 2275328 | 176359680 | GPU_0_bfc | 1334528 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1327104.00 | 1257429.33 | 45.40 | 0.00 | 0.00 | true | 0.454560;0.454592;0.454058;0.454859;0.451828 | 0;0;0;0;0 | 1327104;1327104;1327104;1327104;1327104 | 1252192;1242240;1263104;1256992;1265312 |
35 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 309.667 | 940800 | 2275328 | 176359680 | GPU_0_bfc | 1334528 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 2.00 | 0 | 352.00 | 3968.00 | 6.10 | 0.00 | 0.00 | true | 0.061127;0.061121;0.061142;0.061130;0.061123 | 0;0;0;0;0 | 352;352;352;352;352 | 3712;4480;3712;5632;3584 |
36 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 35 35]] | 25.667 | 940800 | 0 | 172488192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 235200 | 1088.00 | 768.00 | 59.70 | 126.72 | 58.80 | false | 0.598477;0.602252;0.595976;0.592975;0.595878 | 235200;235200;235200;235200;235200 | 1088;1088;1088;1088;1088 | 1536;1792;128;640;128 |
37 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 192 35 35]] | 19 | 940800 | 0 | 172488192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 96.00 | 298.67 | 68.40 | 0.00 | 0.00 | true | 0.675168;0.685835;0.687646;0.683116;0.684328 | 0;0;0;0;0 | 96;96;96;96;96 | 512;128;384;128;384 |
39 | InceptionV4/InceptionV4/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 384 35 35]] | 52.667 | 1881600 | 1881600 | 174369792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 18.33 | 12156951 | 10240.00 | 1939744.00 | 62.10 | 6.23 | 663.12 | true | 0.621121;0.620669;0.618912;0.620392;0.621640 | 12156951;12156951;12156951;12156951;12156951 | 10240;10240;10240;10240;10240 | 1939616;1939264;1942656;1938432;1940352 |
40 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 119.333 | 313600 | 411904 | 174683392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 32.00 | 61420096 | 17472.00 | 373376.00 | 3.10 | 157.15 | 1919.38 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 61420096;61420096;61420096;61420096;61420096 | 17728;17472;17472;17472;17472 | 373088;372384;373376;373664;375328 |
40 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 119.333 | 313600 | 411904 | 174683392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98304.00 | 597.33 | 42.70 | 0.00 | 0.00 | true | 0.425285;0.426892;0.427545;0.426611;0.429541 | 0;0;0;0;0 | 98304;98304;98304;98304;98304 | 640;512;640;512;640 |
41 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 105.333 | 313600 | 411904 | 174996992 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 61420096 | 0.00 | 322090.67 | 3.10 | 190.69 | 2456.80 | false | 0.031229;0.031229;0.031229;0.031230;0.031229 | 61420096;61420096;61420096;61420096;61420096 | 0;0;0;0;0 | 322048;322432;321792;322688;320512 |
41 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 105.333 | 313600 | 411904 | 174996992 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98304.00 | 15658.67 | 42.80 | 0.00 | 0.00 | true | 0.429927;0.426729;0.428301;0.428353;0.428564 | 0;0;0;0;0 | 16384;15872;12672;15744;15360 | 98304;98304;98304;98304;98304 |
42 | InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.667 | 470528 | 617984 | 175467520 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 92130144 | 0.00 | 508629.33 | 4.60 | 181.13 | 2490.00 | false | 0.045825;0.046062;0.044962;0.044936;0.045932 | 92130144;92130144;92130144;92130144;92130144 | 0;0;0;0;0 | 510752;510240;506688;505760;508960 |
42 | InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.667 | 470528 | 617984 | 175467520 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 5162.67 | 43.50 | 0.00 | 0.00 | true | 0.436154;0.435562;0.434096;0.435556;0.434938 | 0;0;0;0;0 | 147456;147456;147456;147456;152576 | 5632;5504;4736;4736;5248 |
43 | InceptionV4/InceptionV4/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.333 | 470528 | 617984 | 174056448 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 92130144 | 0.00 | 455701.33 | 4.50 | 202.17 | 2490.00 | false | 0.045309;0.044942;0.045655;0.045230;0.045138 | 92130144;92130144;92130144;92130144;92130144 | 455008;457600;446720;454720;457376 | 0;0;0;0;0 |
43 | InceptionV4/InceptionV4/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.333 | 470528 | 617984 | 174056448 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 13258.67 | 43.30 | 0.00 | 0.00 | true | 0.431616;0.436973;0.434022;0.432502;0.433566 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 10272;12064;14368;15008;13344 |
44 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 25 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 512.00 | 35498.67 | 45.30 | 2.18 | 19.60 | true | 0.459542;0.452928;0.451685;0.453324;0.452899 | 78400;78400;78400;78400;78400 | 39040;35200;32256;31616;44096 | 512;512;512;512;512 |
45 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 19.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 78400 | 256.00 | 97184.00 | 44.10 | 0.80 | 26.13 | true | 0.440375;0.440256;0.443248;0.439742;0.440876 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 95456;96032;97152;100736;98368 |
46 | InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19.333 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 187285.33 | 53.60 | 0.63 | 29.40 | true | 0.537254;0.535933;0.535193;0.535762;0.530740 | 117600;117600;117600;117600;117600 | 384;10112;384;384;384 | 188640;186048;189376;187168;177600 |
47 | InceptionV4/InceptionV4/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 18.667 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 117600 | 384.00 | 33034.67 | 53.60 | 3.52 | 39.20 | true | 0.536030;0.537647;0.536041;0.536883;0.534578 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 30208;33440;35200;30464;38272 |
48 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 18.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438398;0.438108;0.438777;0.438918;0.438801 | 0;0;0;0;0 | 0;0;256;0;0 | 0;0;0;0;0 |
49 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 18 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 426.67 | 43.60 | 0.00 | 0.00 | true | 0.436402;0.435600;0.436204;0.435927;0.436665 | 0;0;0;0;0 | 0;0;0;3328;0 | 0;384;512;384;512 |
50 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 119.333 | 470528 | 1411072 | 172645376 | GPU_0_bfc | 940544 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 22.00 | 99993600 | 5632.00 | 542677.33 | 12.50 | 182.37 | 4545.16 | false | 0.124670;0.124656;0.124656;0.124701;0.124670 | 99993600;99993600;99993600;99993600;99993600 | 7424;5888;5376;5632;5376 | 543360;534400;543072;551232;541600 |
50 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 119.333 | 470528 | 1411072 | 172645376 | GPU_0_bfc | 940544 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.00 | 356352 | 512.00 | 86314.67 | 6.20 | 4.10 | 71.27 | true | 0.062334;0.062333;0.062350;0.062338;0.062338 | 356352;356352;356352;356352;356352 | 512;3328;512;512;512 | 84864;90880;83840;88960;85120 |
50 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 119.333 | 470528 | 1411072 | 172645376 | GPU_0_bfc | 940544 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 221184.00 | 4362.67 | 44.10 | 0.00 | 0.00 | true | 0.442025;0.441722;0.440697;0.441037;0.441578 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 3456;4640;5632;4992;3328 |
51 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 112.667 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 20.00 | 99993600 | 0.00 | 311082.67 | 12.50 | 321.44 | 4999.68 | false | 0.124638;0.124636;0.124622;0.124627;0.124643 | 99993600;99993600;99993600;99993600;99993600 | 0;0;0;0;0 | 310784;311104;311456;311104;311040 |
51 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 112.667 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 221184.00 | 2560.00 | 45.10 | 0.00 | 0.00 | true | 0.450443;0.451067;0.450926;0.451402;0.450752 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 2048;2944;3456;1152;2688 |
51 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 112.667 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 356352 | 0.00 | 142410.67 | 6.20 | 2.50 | 118.78 | true | 0.062276;0.062280;0.062293;0.062277;0.062275 | 356352;356352;356352;356352;356352 | 143104;142304;141344;144480;141824 | 0;0;0;0;0 |
52 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 24 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 117600 | 384.00 | 298.67 | 54.60 | 172.27 | 39.20 | false | 0.545756;0.546587;0.545636;0.546557;0.542897 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 256;384;256;384;256 |
53 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 117600 | 384.00 | 640.00 | 53.70 | 114.84 | 32.07 | false | 0.537724;0.537152;0.537824;0.537299;0.535818 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 1408;640;512;768;512 |
54 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 18.667 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 59.80 | 0.00 | 0.00 | true | 0.597586;0.588181;0.598208;0.599524;0.597298 | 0;0;0;0;0 | 256;0;0;0;0 | 0;0;0;0;0 |
55 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118 | 470528 | 2019328 | 172959232 | GPU_0_bfc | 1548800 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 27.00 | 149022720 | 0.00 | 443669.33 | 12.50 | 335.89 | 5519.36 | false | 0.124734;0.124725;0.124725;0.124733;0.124723 | 149022720;149022720;149022720;149022720;149022720 | 0;0;0;0;7680 | 443296;445824;443232;444480;442208 |
55 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118 | 470528 | 2019328 | 172959232 | GPU_0_bfc | 1548800 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 331776.00 | 1194.67 | 44.60 | 0.00 | 0.00 | true | 0.444500;0.446995;0.446091;0.446207;0.446713 | 0;0;0;0;0 | 331776;331776;331776;336384;331776 | 1024;1152;1408;1152;1280 |
55 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118 | 470528 | 2019328 | 172959232 | GPU_0_bfc | 1548800 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 534528 | 0.00 | 43904.00 | 6.20 | 12.17 | 178.18 | true | 0.062257;0.062281;0.062270;0.062285;0.062264 | 534528;534528;534528;534528;534528 | 0;0;0;0;1536 | 44160;41984;44416;43136;45312 |
56 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 24 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 117600 | 384.00 | 0.00 | 54.60 | 306.25 | 35.28 | false | 0.544618;0.546438;0.546530;0.546042;0.546905 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 0;0;0;0;0 |
58 | InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 384 35 35]] | 21.333 | 1881600 | 0 | 172488192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 412181.33 | 65.00 | 0.00 | 0.00 | true | 0.649576;0.649586;0.650358;0.648344;0.651502 | 0;0;0;0;0 | 413344;413248;411712;411584;410272 | 0;0;0;0;0 |
59 | InceptionV4/InceptionV4/Mixed_5c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 384 35 35]] | 52 | 1881600 | 1881600 | 174369792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 20.00 | 10644816 | 0.00 | 1882805.33 | 60.70 | 5.65 | 532.24 | true | 0.606513;0.607028;0.605821;0.606838;0.606801 | 10644816;10644816;10644816;10644816;10644816 | 1881888;1881536;1881312;1884992;1884992 | 0;0;512;0;0 |
60 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107.333 | 313600 | 411904 | 174683392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 61420096 | 1621.33 | 412138.67 | 3.10 | 148.44 | 2456.80 | false | 0.031227;0.031227;0.031228;0.031227;0.031226 | 61420096;61420096;61420096;61420096;61420096 | 4352;256;7680;256;256 | 411968;412480;411968;412480;411968 |
60 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107.333 | 313600 | 411904 | 174683392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98368.00 | 789.33 | 42.90 | 0.00 | 0.00 | true | 0.429104;0.428443;0.429561;0.429585;0.429342 | 0;0;0;0;0 | 98368;98368;98368;98368;98368 | 704;960;704;960;704 |
61 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 106 | 313600 | 411904 | 174996992 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 61420096 | 0.00 | 313952.00 | 3.10 | 195.64 | 2456.80 | false | 0.031229;0.031230;0.031230;0.031230;0.031230 | 61420096;61420096;61420096;61420096;61420096 | 0;0;0;0;0 | 313952;302176;313952;314080;313952 |
61 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 106 | 313600 | 411904 | 174996992 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98304.00 | 85.33 | 42.90 | 0.00 | 0.00 | true | 0.429589;0.429829;0.430752;0.429039;0.428045 | 0;0;0;0;0 | 128;0;128;0;128 | 98304;98304;98304;98304;98304 |
62 | InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.333 | 470528 | 617984 | 175467520 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 92130144 | 0.00 | 517418.67 | 4.50 | 178.06 | 2490.00 | false | 0.045557;0.045303;0.045327;0.044951;0.044995 | 92130144;92130144;92130144;92130144;92130144 | 0;0;0;0;0 | 517536;517408;517408;517408;517440 |
62 | InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.333 | 470528 | 617984 | 175467520 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 148309.33 | 512.00 | 43.60 | 0.00 | 0.00 | true | 0.435401;0.436351;0.435123;0.435751;0.437671 | 0;0;0;0;0 | 384;512;512;512;512 | 147456;147456;150016;147456;152832 |
63 | InceptionV4/InceptionV4/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.333 | 470528 | 617984 | 174056448 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 92130144 | 4352.00 | 511669.33 | 4.50 | 178.54 | 2490.00 | false | 0.044921;0.044905;0.045278;0.045166;0.044945 | 92130144;92130144;92130144;92130144;92130144 | 4352;4352;4352;4096;4352 | 516192;503776;515680;501216;515552 |
63 | InceptionV4/InceptionV4/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115.333 | 470528 | 617984 | 174056448 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 12970.67 | 43.60 | 0.00 | 0.00 | true | 0.434809;0.435496;0.435667;0.437135;0.435986 | 0;0;0;0;0 | 147456;153856;147456;147456;147456 | 12160;17280;12416;13952;12544 |
64 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 25 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 78400 | 256.00 | 45002.67 | 44.40 | 1.73 | 26.13 | true | 0.443353;0.443886;0.444844;0.443866;0.446316 | 78400;78400;78400;78400;78400 | 5632;256;256;256;256 | 52768;46496;42144;44704;43808 |
65 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 19.667 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 78400 | 256.00 | 123701.33 | 44.10 | 0.63 | 26.13 | true | 0.439377;0.441706;0.441953;0.444698;0.439535 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 122624;124320;124160;116224;126848 |
66 | InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19.667 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 228352.00 | 53.60 | 0.51 | 29.40 | true | 0.535657;0.535452;0.534430;0.538888;0.537788 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 224000;230912;233600;218560;230144 |
67 | InceptionV4/InceptionV4/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 117600 | 384.00 | 30293.33 | 53.60 | 3.83 | 39.20 | true | 0.535421;0.537209;0.536624;0.538561;0.535432 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 23680;35072;30208;25600;38400 |
68 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 18.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.438455;0.438056;0.438420;0.438213;0.438826 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;0;0;0 |
69 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 19.667 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 512.00 | 43.60 | 0.00 | 0.00 | true | 0.435138;0.436712;0.435815;0.436202;0.435985 | 0;0;0;0;0 | 0;0;0;0;0 | 512;512;512;512;512 |
70 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 113.333 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 20.00 | 99993600 | 256.00 | 328021.33 | 12.50 | 304.60 | 4999.68 | false | 0.124621;0.124611;0.124623;0.124633;0.124618 | 99993600;99993600;99993600;99993600;99993600 | 336896;333792;319456;328608;321664 | 256;256;256;256;256 |
70 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 113.333 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 221184.00 | 71157.33 | 44.60 | 0.00 | 0.00 | true | 0.444128;0.443393;0.452361;0.445078;0.447654 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 70048;68096;71808;72928;71616 |
70 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 113.333 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 356352 | 0.00 | 126922.67 | 6.20 | 2.81 | 118.78 | true | 0.062302;0.062304;0.062304;0.062294;0.062294 | 356352;356352;356352;356352;356352 | 130048;129344;125824;125600;124832 | 0;0;0;0;0 |
71 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 112.333 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 20.00 | 99993600 | 0.00 | 285130.67 | 12.50 | 350.69 | 4999.68 | false | 0.124633;0.124626;0.124637;0.124622;0.124610 | 99993600;99993600;99993600;99993600;99993600 | 0;0;0;0;0 | 285952;284032;282880;286816;285408 |
71 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 112.333 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 221184.00 | 4778.67 | 44.40 | 0.00 | 0.00 | true | 0.443958;0.444857;0.444004;0.445763;0.444007 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 2336;4096;5760;5760;4480 |
71 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 112.333 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 356352 | 0.00 | 177632.00 | 6.20 | 2.01 | 118.78 | true | 0.062295;0.062301;0.062293;0.062283;0.062296 | 356352;356352;356352;356352;356352 | 0;0;1536;0;0 | 180640;175520;179488;171744;177888 |
72 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 24.667 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 117600 | 384.00 | 597.33 | 54.60 | 119.84 | 32.07 | false | 0.544584;0.545281;0.546233;0.545964;0.546189 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 384;640;896;768;0 |
73 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 117600 | 384.00 | 298.67 | 53.90 | 172.27 | 39.20 | false | 0.539027;0.538173;0.537580;0.539056;0.538909 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 640;128;0;1024;128 |
74 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 18.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 554.67 | 59.90 | 0.00 | 0.00 | true | 0.599455;0.598597;0.590364;0.599391;0.599183 | 0;0;0;0;0 | 0;0;0;0;0 | 384;768;256;768;512 |
75 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 119 | 470528 | 1724416 | 172959232 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 27.00 | 149022720 | 0.00 | 405354.67 | 12.50 | 367.64 | 5519.36 | false | 0.124722;0.124723;0.124734;0.124728;0.124720 | 149022720;149022720;149022720;149022720;149022720 | 406336;405760;402464;406432;403968 | 0;0;0;0;0 |
75 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 119 | 470528 | 1724416 | 172959232 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 331776.00 | 60992.00 | 44.70 | 0.00 | 0.00 | true | 0.447170;0.446149;0.447190;0.446832;0.446959 | 0;0;0;0;0 | 73568;60928;60640;61152;60896 | 331776;331776;331776;331776;331776 |
75 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 119 | 470528 | 1724416 | 172959232 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 534528 | 0.00 | 318368.00 | 6.20 | 1.68 | 178.18 | true | 0.062265;0.062256;0.062257;0.062270;0.062261 | 534528;534528;534528;534528;534528 | 0;0;1536;0;0 | 310816;323648;315040;323488;316576 |
76 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 26.667 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 117600 | 384.00 | 256.00 | 54.60 | 183.75 | 32.07 | false | 0.545874;0.546155;0.546852;0.545882;0.546300 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 384;256;128;256;256 |
78 | InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 384 35 35]] | 22.667 | 2508288 | 0 | 173114880 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 526656.00 | 64.80 | 0.00 | 0.00 | true | 0.644921;0.647617;0.647197;0.649840;0.651145 | 0;0;0;0;0 | 0;0;0;0;0 | 523744;520544;526976;529280;529248 |
79 | InceptionV4/InceptionV4/Mixed_5d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 384 35 35]] | 51.333 | 1882112 | 1882112 | 174996992 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 20.00 | 10427196 | 256.00 | 1881365.33 | 61.10 | 5.54 | 521.36 | true | 0.610408;0.611533;0.610054;0.611139;0.611472 | 10427196;10427196;10427196;10427196;10427196 | 256;256;256;256;256 | 1881120;1881408;1886912;1881408;1881280 |
80 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107.667 | 313600 | 411904 | 175310592 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 61420096 | 256.00 | 412917.33 | 3.10 | 148.65 | 2456.80 | false | 0.031231;0.031229;0.031229;0.031230;0.031229 | 61420096;61420096;61420096;61420096;61420096 | 256;256;256;5632;256 | 413088;413056;412704;412992;405280 |
80 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107.667 | 313600 | 411904 | 175310592 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98304.00 | 725.33 | 42.70 | 0.00 | 0.00 | true | 0.427235;0.426621;0.431044;0.426560;0.426994 | 0;0;0;0;0 | 640;640;896;640;8576 | 98304;98304;98304;98304;100864 |
81 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 105 | 313600 | 411904 | 175624192 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 61420096 | 0.00 | 303978.67 | 3.10 | 202.05 | 2456.80 | false | 0.031229;0.031230;0.031229;0.031228;0.031230 | 61420096;61420096;61420096;61420096;61420096 | 0;0;0;0;0 | 303712;293984;304096;304864;304128 |
81 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 105 | 313600 | 411904 | 175624192 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98304.00 | 512.00 | 42.90 | 0.00 | 0.00 | true | 0.427920;0.428163;0.429364;0.429493;0.429643 | 0;0;0;0;0 | 98304;98304;98304;98304;98304 | 512;11008;512;512;512 |
82 | InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116 | 470528 | 617984 | 176094720 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 92130144 | 0.00 | 499872.00 | 4.50 | 184.31 | 2490.00 | false | 0.045671;0.045018;0.044921;0.045336;0.045591 | 92130144;92130144;92130144;92130144;92130144 | 499456;503936;500704;499456;499168 | 0;0;0;0;256 |
82 | InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116 | 470528 | 617984 | 176094720 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 3072.00 | 43.30 | 0.00 | 0.00 | true | 0.430690;0.432832;0.432177;0.432566;0.432773 | 0;0;0;0;0 | 147456;147456;147456;148992;147456 | 2944;2560;3200;5504;3072 |
83 | InceptionV4/InceptionV4/Mixed_5d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.333 | 470528 | 617984 | 174056960 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 92130144 | 145066.67 | 520384.00 | 4.50 | 138.45 | 2490.00 | false | 0.044609;0.045960;0.045491;0.045123;0.044905 | 92130144;92130144;92130144;92130144;92130144 | 520864;520960;517600;520960;519328 | 146944;147456;142464;144128;144128 |
83 | InceptionV4/InceptionV4/Mixed_5d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.333 | 470528 | 617984 | 174056960 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 13226.67 | 43.40 | 0.00 | 0.00 | true | 0.430960;0.437947;0.432342;0.433235;0.435824 | 0;0;0;0;0 | 147456;150016;147456;147456;147456 | 14208;12544;13312;12416;13824 |
84 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 25.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 27349.33 | 56053.33 | 44.50 | 0.94 | 19.60 | true | 0.445277;0.446935;0.444870;0.445914;0.444721 | 78400;78400;78400;78400;78400 | 54816;51232;55712;57760;57632 | 28928;28288;26752;27008;24832 |
85 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 19.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 12586.67 | 104736.00 | 44.10 | 0.67 | 19.60 | true | 0.441477;0.441659;0.441894;0.440834;0.440340 | 78400;78400;78400;78400;78400 | 12928;13952;12288;12544;10368 | 104480;104608;105120;104992;104608 |
86 | InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 264672.00 | 53.90 | 0.44 | 29.40 | true | 0.538573;0.538731;0.537358;0.538518;0.538478 | 117600;117600;117600;117600;117600 | 265088;265216;264832;260480;264096 | 384;384;384;7808;384 |
87 | InceptionV4/InceptionV4/Mixed_5d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19.333 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 117600 | 384.00 | 18261.33 | 53.80 | 6.31 | 39.20 | true | 0.537486;0.537957;0.537950;0.537765;0.537275 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 16768;28160;13952;23168;14848 |
88 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 19.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438399;0.438558;0.439267;0.438499;0.438639 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;2944 |
89 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 17 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 554.67 | 43.70 | 0.00 | 0.00 | true | 0.437488;0.437023;0.437116;0.436694;0.437200 | 0;0;0;0;0 | 384;512;640;512;640 | 0;0;0;0;0 |
90 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 117 | 470528 | 1882112 | 172645376 | GPU_0_bfc | 1411584 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 20.00 | 99993600 | 256.00 | 668181.33 | 12.50 | 149.59 | 4999.68 | false | 0.124624;0.124620;0.124610;0.124631;0.124632 | 99993600;99993600;99993600;99993600;99993600 | 512;256;256;256;256 | 660160;668224;673856;667584;668736 |
90 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 117 | 470528 | 1882112 | 172645376 | GPU_0_bfc | 1411584 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 221184.00 | 93386.67 | 45.20 | 0.00 | 0.00 | true | 0.452797;0.453182;0.448861;0.452024;0.452112 | 0;0;0;0;0 | 94272;93664;100800;88512;92224 | 221184;221184;221184;221184;223232 |
90 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 117 | 470528 | 1882112 | 172645376 | GPU_0_bfc | 1411584 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.33 | 356352 | 0.00 | 242837.33 | 6.20 | 1.47 | 106.92 | true | 0.062288;0.062293;0.062295;0.062295;0.062304 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 243264;245312;240320;244928;239680 |
91 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 111 | 470528 | 1411584 | 172802304 | GPU_0_bfc | 941056 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 20.00 | 99993600 | 1024.00 | 246528.00 | 12.50 | 403.93 | 4999.68 | false | 0.124629;0.124634;0.124620;0.124614;0.124620 | 99993600;99993600;99993600;99993600;99993600 | 235776;254848;235520;253312;250496 | 0;4608;0;0;3072 |
91 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 111 | 470528 | 1411584 | 172802304 | GPU_0_bfc | 941056 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 221184.00 | 29184.00 | 44.50 | 0.00 | 0.00 | true | 0.450730;0.444210;0.444588;0.445198;0.443405 | 0;0;0;0;0 | 30336;26240;28800;28800;29952 | 221184;221184;221184;221184;221184 |
91 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 111 | 470528 | 1411584 | 172802304 | GPU_0_bfc | 941056 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 356352 | 0.00 | 170613.33 | 6.20 | 2.09 | 118.78 | true | 0.062293;0.062283;0.062279;0.062275;0.062281 | 356352;356352;356352;356352;356352 | 173344;176352;169760;168640;168736 | 0;0;0;0;0 |
92 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 25.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 117600 | 384.00 | 0.00 | 54.60 | 306.25 | 35.28 | false | 0.542832;0.546916;0.546600;0.545311;0.546261 | 117600;117600;117600;117600;117600 | 0;0;0;0;0 | 384;384;384;384;384 |
93 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 117600 | 384.00 | 128.00 | 53.80 | 229.69 | 39.20 | false | 0.537587;0.538194;0.539167;0.537881;0.538411 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 128;128;128;0;128 |
94 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 18 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 59.70 | 0.00 | 0.00 | true | 0.598161;0.598319;0.590404;0.597016;0.597163 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
95 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.667 | 470528 | 1863168 | 172959232 | GPU_0_bfc | 1392640 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 27.00 | 149022720 | 0.00 | 410368.00 | 12.50 | 363.14 | 5519.36 | false | 0.124730;0.124732;0.124736;0.124735;0.124731 | 149022720;149022720;149022720;149022720;149022720 | 0;0;0;0;0 | 411456;411360;408512;411232;398848 |
95 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.667 | 470528 | 1863168 | 172959232 | GPU_0_bfc | 1392640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 331776.00 | 86645.33 | 43.90 | 0.00 | 0.00 | true | 0.437566;0.442156;0.438734;0.438152;0.439251 | 0;0;0;0;0 | 331776;334080;331776;331776;331776 | 87712;86304;85920;88992;85536 |
95 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.667 | 470528 | 1863168 | 172959232 | GPU_0_bfc | 1392640 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 534528 | 0.00 | 297376.00 | 6.20 | 1.80 | 178.18 | true | 0.062264;0.062270;0.062268;0.062273;0.062271 | 534528;534528;534528;534528;534528 | 0;0;0;0;0 | 296096;297888;298400;294944;298144 |
96 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 24 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 117600 | 384.00 | 384.00 | 54.60 | 153.12 | 32.07 | false | 0.545027;0.546127;0.545873;0.547125;0.546620 | 117600;117600;117600;117600;117600 | 384;640;384;384;384 | 512;384;256;768;256 |
98 | InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 384 35 35]] | 21.333 | 2664960 | 0 | 173271552 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 321536.00 | 64.90 | 0.00 | 0.00 | true | 0.640305;0.648185;0.650328;0.647830;0.649553 | 0;0;0;0;0 | 0;0;0;0;0 | 318368;323136;322176;326656;319296 |
99 | InceptionV4/InceptionV4/Mixed_5e/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 384 35 35]] | 50 | 2352640 | 2352640 | 175624192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 20.00 | 10465086 | 256.00 | 1651978.67 | 61.30 | 6.33 | 523.25 | true | 0.613067;0.613518;0.613252;0.613241;0.612125 | 10465086;10465086;10465086;10465086;10465086 | 256;256;256;256;256 | 1650112;1650656;1652064;1653216;1656512 |
100 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 108 | 313600 | 411904 | 175937792 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 61420096 | 256.00 | 412778.67 | 3.10 | 148.70 | 2456.80 | false | 0.031229;0.031229;0.031229;0.031229;0.031228 | 61420096;61420096;61420096;61420096;61420096 | 256;256;256;4864;256 | 412384;412864;412768;413344;412704 |
100 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 108 | 313600 | 411904 | 175937792 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98304.00 | 682.67 | 43.00 | 0.00 | 0.00 | true | 0.429767;0.429529;0.428846;0.429606;0.429523 | 0;0;0;0;0 | 103680;98304;98304;98304;98304 | 640;768;640;640;768 |
101 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107 | 313600 | 411904 | 176251392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 61420096 | 0.00 | 290282.67 | 3.10 | 211.59 | 2456.80 | false | 0.031230;0.031230;0.031228;0.031229;0.031229 | 61420096;61420096;61420096;61420096;61420096 | 0;0;0;0;0 | 289952;290816;290560;290336;288832 |
101 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107 | 313600 | 411904 | 176251392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98816.00 | 426.67 | 43.00 | 0.00 | 0.00 | true | 0.429291;0.430198;0.430151;0.430023;0.428290 | 0;0;0;0;0 | 99840;99840;98304;98304;98304 | 384;640;384;512;384 |
102 | InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 117.667 | 470528 | 617984 | 176721920 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 92130144 | 0.00 | 500384.00 | 4.60 | 184.12 | 2490.00 | false | 0.045061;0.045478;0.045770;0.045453;0.045711 | 92130144;92130144;92130144;92130144;92130144 | 0;0;256;0;0 | 499168;504288;502752;496128;499232 |
102 | InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 117.667 | 470528 | 617984 | 176721920 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 22432.00 | 43.50 | 0.00 | 0.00 | true | 0.435150;0.432860;0.439029;0.434200;0.436190 | 0;0;0;0;0 | 147456;147456;147456;148992;147456 | 22240;21344;21632;26112;23424 |
103 | InceptionV4/InceptionV4/Mixed_5e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115 | 470528 | 617984 | 174527488 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 37.00 | 92130144 | 46293.33 | 554058.67 | 4.50 | 153.46 | 2490.00 | false | 0.044928;0.045113;0.045178;0.045359;0.045517 | 92130144;92130144;92130144;92130144;92130144 | 46080;47104;45952;46848;45952 | 570752;547872;552224;553664;556288 |
103 | InceptionV4/InceptionV4/Mixed_5e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115 | 470528 | 617984 | 174527488 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 2816.00 | 43.60 | 0.00 | 0.00 | true | 0.435890;0.436674;0.435649;0.437050;0.436481 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 2944;3200;2304;3200;1280 |
104 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 28 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 10154.67 | 9514.67 | 45.00 | 3.99 | 19.60 | true | 0.452474;0.444546;0.452505;0.445231;0.452271 | 78400;78400;78400;78400;78400 | 10240;10240;9984;9856;10368 | 9216;9600;8960;14592;9728 |
105 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 20 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 78400 | 512.00 | 175360.00 | 44.10 | 0.45 | 26.13 | true | 0.442197;0.441036;0.440907;0.440178;0.439597 | 78400;78400;78400;78400;78400 | 512;256;512;512;512 | 173728;174752;178976;168608;177600 |
106 | InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 272170.67 | 53.50 | 0.43 | 29.40 | true | 0.535826;0.534742;0.535518;0.536001;0.532548 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 258688;270592;276224;270208;275712 |
107 | InceptionV4/InceptionV4/Mixed_5e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 117600 | 384.00 | 48170.67 | 53.70 | 2.42 | 39.20 | true | 0.535856;0.537854;0.537327;0.536559;0.535324 | 117600;117600;117600;117600;117600 | 45824;48768;49536;46208;53376 | 384;384;384;384;5760 |
108 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 18.667 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 85.33 | 341.33 | 43.90 | 0.00 | 0.00 | true | 0.438533;0.438804;0.438624;0.438108;0.438648 | 0;0;0;0;0 | 256;0;4864;0;0 | 384;256;384;256;384 |
109 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 19 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.70 | 0.00 | 0.00 | true | 0.437357;0.436934;0.437441;0.437720;0.436729 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
110 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 113 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 20.00 | 99993600 | 256.00 | 428896.00 | 12.50 | 233.00 | 4999.68 | false | 0.124620;0.124626;0.124627;0.124616;0.124622 | 99993600;99993600;99993600;99993600;99993600 | 512;256;256;256;256 | 434688;429088;433120;424480;414240 |
110 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 113 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 221184.00 | 12245.33 | 44.50 | 0.00 | 0.00 | true | 0.444585;0.443441;0.444417;0.449658;0.445071 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 12672;11776;12288;15488;11264 |
110 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 113 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 356352 | 0.00 | 132640.00 | 6.20 | 2.69 | 118.78 | true | 0.062315;0.062315;0.062312;0.062315;0.062307 | 356352;356352;356352;356352;356352 | 132512;131616;133152;134816;132256 | 0;0;0;0;256 |
111 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 20.00 | 99993600 | 0.00 | 259018.67 | 12.50 | 386.05 | 4999.68 | false | 0.124627;0.124626;0.124622;0.124617;0.124616 | 99993600;99993600;99993600;99993600;99993600 | 260256;258720;259872;258464;257408 | 0;0;0;0;0 |
111 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 221184.00 | 9173.33 | 44.80 | 0.00 | 0.00 | true | 0.449940;0.445659;0.445197;0.450587;0.449117 | 0;0;0;0;0 | 221440;221184;221184;221184;221184 | 9472;8448;9600;10624;8064 |
111 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 356352 | 0.00 | 184096.00 | 6.20 | 1.94 | 118.78 | true | 0.062277;0.062292;0.062283;0.062280;0.062282 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 185216;183520;183680;181600;185088 |
112 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 24.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 117600 | 384.00 | 1920.00 | 54.60 | 51.04 | 39.20 | false | 0.545318;0.545075;0.546412;0.546260;0.546241 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 256;2176;1280;3456;2304 |
113 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 19 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 117600 | 384.00 | 1920.00 | 53.70 | 51.04 | 39.20 | false | 0.535798;0.537688;0.536697;0.538443;0.535963 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 384;2048;1024;2944;2688 |
114 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 18.667 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42.67 | 59.80 | 0.00 | 0.00 | true | 0.597494;0.598798;0.598216;0.597820;0.597050 | 0;0;0;0;0 | 0;256;0;0;0 | 0;128;0;128;0 |
115 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.333 | 470528 | 1880576 | 172959232 | GPU_0_bfc | 1410048 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 27.00 | 149022720 | 0.00 | 412885.33 | 12.50 | 360.93 | 5519.36 | false | 0.124725;0.124729;0.124732;0.124733;0.124729 | 149022720;149022720;149022720;149022720;149022720 | 256;0;0;0;0 | 413120;414848;412256;413280;409376 |
115 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.333 | 470528 | 1880576 | 172959232 | GPU_0_bfc | 1410048 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 331840.00 | 98506.67 | 43.50 | 0.00 | 0.00 | true | 0.434577;0.435479;0.436453;0.434118;0.436255 | 0;0;0;0;0 | 331840;331840;331840;331840;331840 | 97344;97184;99424;101248;98752 |
115 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.333 | 470528 | 1880576 | 172959232 | GPU_0_bfc | 1410048 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.00 | 534528 | 0.00 | 201632.00 | 6.20 | 2.65 | 178.18 | true | 0.062268;0.062287;0.062265;0.062266;0.062289 | 534528;534528;534528;534528;534528 | 204672;193056;202016;198208;208544 | 0;0;0;0;0 |
116 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 24 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 1333.33 | 54.60 | 68.48 | 29.40 | false | 0.545621;0.543779;0.545930;0.547441;0.545366 | 117600;117600;117600;117600;117600 | 2944;384;384;384;384 | 1056;1408;1536;2688;256 |
118 | InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 384 35 35]] | 22.333 | 2352640 | 0 | 172959232 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 408437.33 | 65.00 | 0.00 | 0.00 | true | 0.651374;0.652034;0.648104;0.649638;0.649266 | 0;0;0;0;0 | 0;0;0;0;0 | 407840;407200;409248;408224;412320 |
119 | InceptionV4/InceptionV4/Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 384 17 17]] | 36.667 | 443904 | 443904 | 173403136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 6.00 | 110976 | 1792.00 | 444224.00 | 43.20 | 0.25 | 18.50 | true | 0.435059;0.433417;0.431140;0.429064;0.432829 | 110976;110976;110976;110976;110976 | 2048;1792;1792;1792;1792 | 444000;444576;444768;444096;444000 |
120 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 123.667 | 940800 | 1235712 | 174343936 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 40.00 | 184260288 | 256.00 | 1056330.67 | 7.70 | 174.39 | 4606.51 | false | 0.077467;0.077282;0.077506;0.077040;0.076393 | 184260288;184260288;184260288;184260288;184260288 | 256;256;256;256;256 | 1051200;1060384;1048064;1057504;1060288 |
120 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 123.667 | 940800 | 1235712 | 174343936 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 294912.00 | 47573.33 | 45.80 | 0.00 | 0.00 | true | 0.452530;0.464398;0.461593;0.461318;0.450322 | 0;0;0;0;0 | 294912;294912;294912;294912;294912 | 46592;47488;47744;49152;47488 |
121 | InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 386.667 | 443904 | 5752320 | 174787840 | GPU_0_bfc | 5308416 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 276.33 | 849457536 | 5854314.67 | 2111765.33 | 4.70 | 106.63 | 3074.04 | false | 0.047169;0.047108;0.047200;0.047326;0.047178 | 849457536;849457536;849457536;849457536;849457536 | 5805888;5783616;5869632;5926144;5887424 | 2111680;2121792;2135488;2089888;2101824 |
121 | InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 386.667 | 443904 | 5752320 | 174787840 | GPU_0_bfc | 5308416 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 32.67 | 0 | 5377258.67 | 5417898.67 | 46.80 | 0.00 | 0.00 | true | 0.464811;0.469477;0.466367;0.469885;0.467520 | 0;0;0;0;0 | 5358400;5341504;5381248;5424320;5392128 | 5384704;5350720;5416192;5488160;5452800 |
122 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 35 35]] | 27 | 940800 | 0 | 172435200 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 235200 | 941888.00 | 176042.67 | 57.30 | 0.21 | 47.04 | true | 0.571561;0.569668;0.572346;0.573705;0.620231 | 235200;235200;235200;235200;235200 | 941888;941888;941888;941888;941888 | 184832;189568;171264;169984;172032 |
123 | InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 21.667 | 443904 | 0 | 172435200 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1728.00 | 2261.33 | 53.70 | 27.82 | 27.74 | true | 0.536422;0.535807;0.538924;0.537186;0.537532 | 110976;110976;110976;110976;110976 | 1728;1728;1728;1728;3776 | 3072;4480;2304;1280;1408 |
124 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 35 35]] | 18.667 | 940800 | 0 | 172435200 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 256.00 | 65.20 | 0.00 | 0.00 | true | 0.651489;0.651720;0.651785;0.654466;0.653530 | 0;0;0;0;0 | 0;0;0;0;0 | 256;256;256;0;1280 |
125 | InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 384 17 17]] | 19 | 443904 | 0 | 172435200 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 85.33 | 57.50 | 0.00 | 0.00 | true | 0.576158;0.575135;0.575794;0.573819;0.575051 | 0;0;0;0;0 | 0;0;0;0;0 | 256;0;0;0;1664 |
126 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 35 35]] | 245.333 | 1097728 | 2646016 | 173532928 | GPU_0_bfc | 1548288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 158.00 | 966406112 | 25493.33 | 1115381.33 | 9.60 | 847.07 | 6116.49 | false | 0.096406;0.096064;0.095800;0.096093;0.096452 | 966406112;966406112;966406112;966406112;966406112 | 1119328;1138336;1103712;1101248;1123104 | 22240;35520;18080;19936;34304 |
126 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 35 35]] | 245.333 | 1097728 | 2646016 | 173532928 | GPU_0_bfc | 1548288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1548288.00 | 481920.00 | 44.90 | 0.00 | 0.00 | true | 0.450014;0.447175;0.449907;0.452474;0.444738 | 0;0;0;0;0 | 1548288;1548288;1548288;1548288;1548288 | 507264;524800;471040;453696;467456 |
127 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 35 35]] | 24 | 1097728 | 0 | 172592128 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 274400 | 896.00 | 57834.67 | 54.70 | 4.67 | 68.60 | true | 0.547206;0.546714;0.548240;0.545628;0.543418 | 274400;274400;274400;274400;274400 | 56736;51552;62336;61152;55616 | 896;896;896;896;896 |
128 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu | Relu | [[1 224 35 35]] | 20 | 1097728 | 0 | 172592128 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 1002.67 | 62.90 | 0.00 | 0.00 | true | 0.617217;0.630192;0.629319;0.629278;0.628203 | 0;0;0;0;0 | 0;0;0;0;0 | 1216;832;960;960;1088 |
129 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 199 | 295936 | 2360320 | 172888064 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 109.67 | 330375424 | 42.67 | 359882.67 | 3.10 | 917.90 | 3012.53 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 330375424;330375424;330375424;330375424;330375424 | 128;0;128;0;0 | 361184;375072;355200;359744;358720 |
129 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 199 | 295936 | 2360320 | 172888064 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 2064384.00 | 1074826.67 | 44.70 | 0.00 | 0.00 | true | 0.447712;0.448125;0.447822;0.445996;0.446657 | 0;0;0;0;0 | 2064384;2066432;2064384;2064384;2064384 | 1084576;1060576;1092224;1063104;1076800 |
130 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 24.333 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 73984 | 1024.00 | 24810.67 | 44.30 | 2.86 | 24.66 | true | 0.443214;0.444140;0.442478;0.443107;0.443815 | 73984;73984;73984;73984;73984 | 26528;24864;24992;24576;24192 | 1024;1024;1024;1024;1024 |
131 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu | Relu | [[1 256 17 17]] | 19.333 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 853.33 | 43.80 | 0.00 | 0.00 | true | 0.437751;0.438353;0.437890;0.437354;0.437492 | 0;0;0;0;0 | 0;0;0;0;0 | 896;768;896;768;896 |
133 | InceptionV4/InceptionV4/Mixed_6b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 46.333 | 1183744 | 1183744 | 174142976 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 14.00 | 7044412 | 256.00 | 985013.33 | 53.30 | 7.15 | 503.17 | true | 0.533274;0.532143;0.532561;0.532509;0.534156 | 7044412;7044412;7044412;7044412;7044412 | 256;256;256;256;256 | 975616;991008;989408;979424;986208 |
134 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 143.333 | 221952 | 1008384 | 174364928 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 386474.67 | 3.10 | 325.73 | 2170.42 | false | 0.031246;0.031246;0.031245;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 394688;388800;385760;369152;384864 |
134 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 143.333 | 221952 | 1008384 | 174364928 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 533408.00 | 43.60 | 0.00 | 0.00 | true | 0.439194;0.440128;0.433379;0.434166;0.432258 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 524128;532448;533664;551264;534112 |
135 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141.333 | 221952 | 1008384 | 174586880 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 172992.00 | 3.10 | 727.69 | 2170.42 | false | 0.031246;0.031246;0.031246;0.031247;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;256 | 180416;169408;176192;173376;166720 |
135 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141.333 | 221952 | 1008384 | 174586880 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 85205.33 | 44.30 | 0.00 | 0.00 | true | 0.440403;0.441136;0.447435;0.446055;0.435607 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 82048;81152;85632;87936;90240 |
136 | InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 174.333 | 443904 | 2016768 | 175030784 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.33 | 251769216 | 1920.00 | 473461.33 | 4.70 | 529.62 | 2916.26 | false | 0.047528;0.047455;0.047602;0.047360;0.047486 | 251769216;251769216;251769216;251769216;251769216 | 2560;2176;1664;1920;1408 | 493216;484320;468224;467840;453184 |
136 | InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 174.333 | 443904 | 2016768 | 175030784 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 1254154.67 | 44.50 | 0.00 | 0.00 | true | 0.443456;0.443908;0.447269;0.447598;0.442704 | 0;0;0;0;0 | 1244480;1242720;1256640;1261344;1270752 | 1578240;1572864;1572864;1572864;1572864 |
137 | InceptionV4/InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 157.333 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 83923072 | 1005269.33 | 240778.67 | 3.10 | 67.35 | 1104.25 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 1003264;1003904;1005568;1007232;1006336 | 250304;245568;229600;234944;241824 |
137 | InceptionV4/InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 157.333 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524288.00 | 296416.00 | 42.90 | 0.00 | 0.00 | true | 0.433624;0.427133;0.432547;0.423241;0.426223 | 0;0;0;0;0 | 524288;524288;524288;526336;524288 | 280960;291232;313152;302528;295488 |
138 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 25.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 36224.00 | 16245.33 | 45.40 | 1.06 | 13.87 | true | 0.454135;0.455183;0.453932;0.453771;0.454401 | 55488;55488;55488;55488;55488 | 33024;34816;35840;38016;38272 | 16032;16928;15648;16288;16416 |
139 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 19.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 119680.00 | 80213.33 | 44.60 | 0.28 | 13.87 | true | 0.445540;0.446625;0.446910;0.445750;0.447650 | 55488;55488;55488;55488;55488 | 119680;119680;119680;127232;119680 | 90112;80000;79744;80384;80256 |
140 | InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 19.667 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 252320.00 | 51.80 | 0.44 | 27.74 | true | 0.518468;0.519665;0.519034;0.517700;0.516598 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1536 | 252448;253216;249248;251296;256800 |
141 | InceptionV4/InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 18.333 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 36992 | 512.00 | 41898.67 | 43.50 | 0.87 | 12.33 | true | 0.435503;0.434984;0.435347;0.435630;0.434057 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 47744;38784;36352;40448;46464 |
142 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 63445.33 | 43.90 | 0.00 | 0.00 | true | 0.437979;0.438318;0.438653;0.438701;0.438667 | 0;0;0;0;0 | 0;0;0;0;0 | 62592;65408;63104;62208;64640 |
143 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 17.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 128.00 | 43.70 | 0.00 | 0.00 | true | 0.436942;0.436942;0.436891;0.436486;0.437026 | 0;0;0;0;0 | 0;0;0;0;0 | 384;128;0;128;128 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 212.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 38.67 | 244930560 | 183840.00 | 1153877.33 | 8.40 | 183.10 | 6334.36 | false | 0.084185;0.084123;0.083970;0.083955;0.084126 | 244930560;244930560;244930560;244930560;244930560 | 183200;181600;186720;179552;189280 | 1136864;1156480;1156288;1148864;1157504 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 212.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 18.00 | 13123584 | 20458.67 | 3767136.00 | 20.00 | 3.46 | 729.09 | true | 0.197046;0.200903;0.200012;0.200872;0.197924 | 13123584;13123584;13123584;13123584;13123584 | 3769920;3772928;3745952;3778976;3758560 | 20288;20864;20288;20800;20288 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 212.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 233461.33 | 665792.00 | 2.00 | 1.29 | 77.47 | true | 0.019895;0.019895;0.019897;0.019895;0.019896 | 1161984;1161984;1161984;1161984;1161984 | 233248;233376;233632;233504;233504 | 687360;669376;663456;663616;664384 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 212.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 294080.00 | 44.00 | 0.00 | 0.00 | true | 0.439165;0.441012;0.440397;0.441790;0.439652 | 0;0;0;0;0 | 300608;288448;291104;293344;297792 | 1032192;1032192;1032192;1032192;1032192 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 212.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 8.00 | 1090176 | 19936.00 | 333536.00 | 2.00 | 3.08 | 136.27 | true | 0.019870;0.019868;0.019871;0.019872;0.019871 | 1090176;1090176;1090176;1090176;1090176 | 19392;19008;19936;22016;20480 | 338336;335936;336256;328416;323744 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 203.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.00 | 285752320 | 2050037.33 | 1648704.00 | 10.00 | 77.26 | 8404.48 | false | 0.099916;0.099934;0.099740;0.099866;0.099882 | 285752320;285752320;285752320;285752320;285752320 | 1675520;1646912;1641152;1623808;1658048 | 2031776;2065824;1984416;2081952;2052512 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 203.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 19.67 | 1161984 | 243872.00 | 629365.33 | 2.00 | 1.33 | 59.08 | true | 0.019902;0.019903;0.019902;0.019902;0.019903 | 1161984;1161984;1161984;1161984;1161984 | 243872;243872;243872;243872;243872 | 649920;645696;622144;620256;618592 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 203.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 9141.33 | 4957344.00 | 18.60 | 3.08 | 956.93 | true | 0.184961;0.186402;0.185835;0.185715;0.185578 | 15310848;15310848;15310848;15310848;15310848 | 7776;10464;7648;9184;10592 | 4946624;4917056;4959712;4965696;4990176 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 203.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1206037.33 | 1056170.67 | 45.10 | 0.00 | 0.00 | true | 0.450734;0.449668;0.450206;0.450940;0.456252 | 0;0;0;0;0 | 1004096;1062336;1071936;1076000;1034240 | 1208768;1204672;1216192;1204672;1204672 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 203.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 10.00 | 1271872 | 12960.00 | 90400.00 | 2.30 | 12.31 | 127.19 | true | 0.023156;0.023156;0.023152;0.023155;0.023151 | 1271872;1271872;1271872;1271872;1271872 | 12960;12960;12800;13056;12960 | 91328;90368;87648;91072;89760 |
146 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 226282.67 | 38154.67 | 46.70 | 0.21 | 13.87 | true | 0.467165;0.465905;0.466079;0.473117;0.466680 | 55488;55488;55488;55488;55488 | 226112;226624;226112;226624;226112 | 39680;36352;44160;37408;37376 |
147 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 21 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 1066.67 | 45.30 | 28.36 | 16.18 | false | 0.454249;0.454044;0.453842;0.452591;0.452322 | 64736;64736;64736;64736;64736 | 1216;1216;1216;6592;1216 | 512;5248;512;768;1920 |
148 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 18.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 1024.00 | 54741.33 | 44.00 | 0.00 | 0.00 | true | 0.439450;0.440298;0.440252;0.439758;0.439894 | 0;0;0;0;0 | 1024;1024;1024;2304;1024 | 55328;54240;54656;57216;53600 |
149 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 17.333 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 640.00 | 43.80 | 0.00 | 0.00 | true | 0.438236;0.437713;0.438152;0.437628;0.437452 | 0;0;0;0;0 | 0;0;0;0;1536 | 0;1152;384;1920;384 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.00 | 285752320 | 1563498.67 | 1713205.33 | 10.00 | 87.21 | 8404.48 | false | 0.100205;0.100196;0.099942;0.100141;0.099726 | 285752320;285752320;285752320;285752320;285752320 | 1548288;1603712;1562272;1579936;1477760 | 1752960;1727424;1701248;1710944;1699680 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 5440.00 | 5069248.00 | 18.20 | 3.02 | 956.93 | true | 0.184090;0.180597;0.182522;0.182781;0.180626 | 15310848;15310848;15310848;15310848;15310848 | 6976;6080;4288;3904;5952 | 5036448;5068640;5072256;5066848;5099424 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 1161984 | 223136.00 | 614410.67 | 2.00 | 1.39 | 72.62 | true | 0.019969;0.019953;0.019982;0.019975;0.019969 | 1161984;1161984;1161984;1161984;1161984 | 223136;223392;223136;223136;223136 | 625056;612128;608992;622112;597024 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204224.00 | 886165.33 | 44.90 | 0.00 | 0.00 | true | 0.446771;0.450312;0.446840;0.453068;0.449794 | 0;0;0;0;0 | 1204224;1207296;1204224;1204224;1204224 | 872640;863648;897216;896992;888864 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 1269.33 | 72426.67 | 2.30 | 17.26 | 317.97 | true | 0.022990;0.022991;0.022990;0.022988;0.023001 | 1271872;1271872;1271872;1271872;1271872 | 69728;71648;76512;69664;75904 | 1312;1184;1312;8480;1056 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208.667 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 41.00 | 380608512 | 6941002.67 | 2714645.33 | 11.00 | 39.42 | 9283.13 | false | 0.110255;0.110188;0.110498;0.110266;0.110280 | 380608512;380608512;380608512;380608512;380608512 | 2723808;2731104;2717408;2675712;2702720 | 7042976;6979616;6924448;6918944;6887072 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208.667 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 27.00 | 20414464 | 85813.33 | 7087008.00 | 19.50 | 2.85 | 756.09 | true | 0.192808;0.196941;0.196343;0.195840;0.193989 | 20414464;20414464;20414464;20414464;20414464 | 89184;85856;82016;87264;84320 | 7136320;7065696;7060992;7100608;7094720 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208.667 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605824.00 | 1108714.67 | 44.90 | 0.00 | 0.00 | true | 0.449113;0.448692;0.450405;0.446952;0.447907 | 0;0;0;0;0 | 1605824;1605824;1605824;1605824;1605824 | 1090048;1106496;1121152;1116992;1102656 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208.667 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 628810.67 | 2.30 | 1.53 | 123.24 | true | 0.023156;0.023190;0.023192;0.023206;0.023193 | 1355648;1355648;1355648;1355648;1355648 | 601120;632384;633440;634048;620608 | 259360;259360;259360;259360;259360 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208.667 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 1453568 | 3232.00 | 35498.67 | 2.60 | 37.53 | 207.65 | false | 0.026243;0.026195;0.026213;0.026204;0.026203 | 1453568;1453568;1453568;1453568;1453568 | 3072;3488;3232;3232;3232 | 36096;34944;35456;33664;36608 |
152 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 26.333 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 260160.00 | 22741.33 | 46.10 | 0.23 | 16.18 | true | 0.460308;0.464291;0.461152;0.459152;0.462770 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;260160 | 18944;20480;23936;23808;25216 |
153 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 19.667 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1344.00 | 298.67 | 44.90 | 45.04 | 18.50 | false | 0.449950;0.449559;0.448153;0.449219;0.447773 | 73984;73984;73984;73984;73984 | 256;128;512;256;384 | 1344;1344;1344;1344;1344 |
154 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 19 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 768.00 | 170.67 | 44.00 | 0.00 | 0.00 | true | 0.439612;0.439866;0.439785;0.439637;0.440639 | 0;0;0;0;0 | 768;768;768;768;768 | 0;1280;0;128;384 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.333 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.00 | 333032448 | 5059200.00 | 2171584.00 | 9.90 | 46.06 | 8325.81 | false | 0.099124;0.099069;0.099288;0.098983;0.098986 | 333032448;333032448;333032448;333032448;333032448 | 4947360;5080704;5025536;5136128;5071360 | 2227872;2188480;2133760;2163392;2162880 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.333 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.33 | 17862656 | 28053.33 | 5964181.33 | 20.10 | 2.98 | 1030.56 | true | 0.201673;0.200357;0.201690;0.199237;0.201996 | 17862656;17862656;17862656;17862656;17862656 | 27072;26816;30272;32448;25408 | 5904896;5955456;6006176;5970624;5966464 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.333 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1406037.33 | 478613.33 | 44.60 | 0.00 | 0.00 | true | 0.448257;0.444596;0.443981;0.452800;0.445094 | 0;0;0;0;0 | 1408256;1404928;1404928;1404928;1409024 | 468896;467360;484320;491936;482624 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.333 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 795008.00 | 2.30 | 1.29 | 123.24 | true | 0.023191;0.023164;0.023167;0.023151;0.023172 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 823904;805792;770240;789152;790080 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.333 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 1482.67 | 41866.67 | 2.30 | 29.34 | 317.97 | false | 0.022952;0.022948;0.022953;0.022951;0.022948 | 1271872;1271872;1271872;1271872;1271872 | 1056;1440;1184;1824;1856 | 44320;42112;40768;39744;42720 |
156 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 25.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 384.00 | 45.70 | 40.46 | 16.18 | false | 0.456816;0.456940;0.458972;0.457006;0.457952 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 384;384;384;800;384 |
157 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 18.333 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 437.33 | 256.00 | 44.90 | 0.00 | 0.00 | true | 0.448622;0.449265;0.448596;0.448873;0.449204 | 0;0;0;0;0 | 608;352;1888;352;352 | 256;256;256;256;256 |
158 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 181.333 | 295936 | 2796288 | 172271360 | GPU_0_bfc | 2500352 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 88.33 | 256975104 | 10602.67 | 155008.00 | 3.10 | 1551.68 | 2909.16 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 172800;163360;146336;148160;153504 | 10432;10688;10432;10688;10944 |
158 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 181.333 | 295936 | 2796288 | 172271360 | GPU_0_bfc | 2500352 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605632.00 | 815648.00 | 44.90 | 0.00 | 0.00 | true | 0.450675;0.447859;0.450797;0.448460;0.449199 | 0;0;0;0;0 | 1605632;1605632;1605632;1605632;1605632 | 764128;781856;832960;840416;832128 |
159 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 24 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 73984 | 1280.00 | 15957.33 | 44.40 | 4.29 | 22.20 | true | 0.444150;0.443920;0.443264;0.443574;0.443377 | 73984;73984;73984;73984;73984 | 1280;1536;1280;1280;1280 | 13472;17536;15136;19328;15200 |
161 | InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 21.667 | 1183744 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 41824.00 | 61.50 | 0.00 | 0.00 | true | 0.616025;0.615896;0.614604;0.614610;0.603624 | 0;0;0;0;0 | 256;256;256;256;1792 | 46688;41056;41056;40064;43360 |
162 | InceptionV4/InceptionV4/Mixed_6c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 46.667 | 1479936 | 1479936 | 173270272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 15.00 | 5985412 | 7488.00 | 752693.33 | 52.70 | 7.87 | 399.03 | true | 0.526902;0.526283;0.528108;0.527101;0.526586 | 5985412;5985412;5985412;5985412;5985412 | 7552;7232;7680;7168;7680 | 754944;756000;747776;750080;753056 |
163 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 147.333 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 482517.33 | 3.10 | 260.89 | 2170.42 | false | 0.031246;0.031246;0.031246;0.031245;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 480608;479232;476576;492192;487712 |
163 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 147.333 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786688.00 | 511093.33 | 44.00 | 0.00 | 0.00 | true | 0.441430;0.440945;0.439988;0.434597;0.438261 | 0;0;0;0;0 | 509344;516032;518720;503072;507904 | 786688;791808;786688;786432;786688 |
164 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140.667 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 115488.00 | 3.10 | 1090.02 | 2170.42 | false | 0.031246;0.031245;0.031245;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;1024;0 | 119072;114400;117344;112992;114720 |
164 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140.667 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786944.00 | 107274.67 | 43.70 | 0.00 | 0.00 | true | 0.436749;0.438886;0.434750;0.432744;0.440787 | 0;0;0;0;0 | 787968;786432;786432;787968;786432 | 103680;107936;105728;109056;108160 |
165 | InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 174 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.00 | 251769216 | 2474.67 | 364874.67 | 4.70 | 685.37 | 2927.55 | false | 0.047353;0.047576;0.047340;0.047465;0.047451 | 251769216;251769216;251769216;251769216;251769216 | 5888;2624;1664;2880;1920 | 368704;376288;364672;352544;361248 |
165 | InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 174 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.33 | 0 | 1572928.00 | 1187093.33 | 45.40 | 0.00 | 0.00 | true | 0.463085;0.440480;0.451959;0.457350;0.451729 | 0;0;0;0;0 | 1572928;1572928;1572928;1572928;1572928 | 1180928;1174816;1187744;1197760;1192608 |
166 | InceptionV4/InceptionV4/Mixed_6c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.333 | 147968 | 1183744 | 173122304 | GPU_0_bfc | 1035776 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 74.33 | 83923072 | 784768.00 | 239562.67 | 3.10 | 81.93 | 1129.02 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 783488;788480;781952;786816;784000 | 244160;232704;237088;240448;241152 |
166 | InceptionV4/InceptionV4/Mixed_6c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.333 | 147968 | 1183744 | 173122304 | GPU_0_bfc | 1035776 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524288.00 | 282144.00 | 43.20 | 0.00 | 0.00 | true | 0.430688;0.430308;0.433548;0.431302;0.432950 | 0;0;0;0;0 | 524288;524288;524288;524288;524288 | 281024;288512;285760;278144;279648 |
167 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 25.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 222976.00 | 6186.67 | 46.10 | 0.24 | 13.87 | true | 0.458843;0.467665;0.457079;0.457033;0.465953 | 55488;55488;55488;55488;55488 | 222848;222976;222976;222976;222976 | 10624;6144;6144;6272;6144 |
168 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 19.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 207872.00 | 44053.33 | 44.80 | 0.22 | 13.87 | true | 0.450120;0.446769;0.448839;0.445533;0.446982 | 55488;55488;55488;55488;55488 | 207872;207872;207872;207872;209920 | 38016;45056;45856;45728;41376 |
169 | InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 20.333 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 345152.00 | 51.90 | 0.32 | 27.74 | true | 0.518374;0.518503;0.518787;0.523669;0.517877 | 110976;110976;110976;110976;110976 | 341664;346912;347648;346880;340096 | 1536;1536;1536;1536;1536 |
170 | InceptionV4/InceptionV4/Mixed_6c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 18.333 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 36992 | 512.00 | 18666.67 | 43.50 | 1.93 | 12.33 | true | 0.434285;0.441335;0.435145;0.434456;0.435046 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 24704;17792;20128;14880;18080 |
171 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 19 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 43477.33 | 43.90 | 0.00 | 0.00 | true | 0.439075;0.439086;0.439609;0.439059;0.439643 | 0;0;0;0;0 | 0;0;0;0;0 | 43392;43648;35584;43776;43392 |
172 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 18 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 896.00 | 43.70 | 0.00 | 0.00 | true | 0.436874;0.437058;0.437429;0.437028;0.436832 | 0;0;0;0;0 | 0;0;0;0;0 | 768;1024;896;1024;768 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 33.00 | 244930560 | 11082.67 | 1026698.67 | 8.70 | 236.01 | 7422.14 | false | 0.086591;0.086595;0.086706;0.086613;0.086563 | 244930560;244930560;244930560;244930560;244930560 | 1070336;1062496;1017408;996800;1000192 | 8544;11680;10400;11168;11808 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 2090.67 | 4201461.33 | 17.10 | 3.12 | 937.40 | true | 0.170095;0.170621;0.171224;0.170115;0.170794 | 13123584;13123584;13123584;13123584;13123584 | 1984;2880;2048;1792;2240 | 4107616;4138752;4213824;4251808;4258848 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222282.67 | 737824.00 | 2.00 | 1.21 | 105.63 | true | 0.020013;0.019977;0.020011;0.019980;0.019996 | 1161984;1161984;1161984;1161984;1161984 | 222240;222240;222368;222368;222240 | 787136;777792;719552;716128;697696 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 971392.00 | 45.00 | 0.00 | 0.00 | true | 0.452083;0.451652;0.447140;0.449563;0.448942 | 0;0;0;0;0 | 966752;957024;977760;969824;977600 | 1032192;1032192;1032192;1032192;1032192 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 1258.67 | 257824.00 | 2.00 | 4.21 | 218.04 | true | 0.019930;0.019928;0.019928;0.019926;0.019922 | 1090176;1090176;1090176;1090176;1090176 | 128;4864;3136;256;384 | 258208;260256;257920;257056;257344 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.67 | 285752320 | 2211808.00 | 1833194.67 | 10.00 | 70.64 | 8242.78 | false | 0.099502;0.099500;0.099696;0.099520;0.099994 | 285752320;285752320;285752320;285752320;285752320 | 2263776;2184224;2187424;2278752;2060320 | 1861984;1859488;1837856;1802240;1788736 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 4618.67 | 4941226.67 | 18.60 | 3.10 | 956.93 | true | 0.184850;0.185235;0.186534;0.188542;0.184327 | 15310848;15310848;15310848;15310848;15310848 | 4768;5024;4448;4640;3488 | 4928864;4884672;4940896;5032832;4953920 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 1161984 | 228426.67 | 590272.00 | 2.00 | 1.42 | 72.62 | true | 0.019943;0.019922;0.019954;0.019953;0.019930 | 1161984;1161984;1161984;1161984;1161984 | 228256;228512;228512;228256;228512 | 613152;593344;593440;571232;584032 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204416.00 | 1044554.67 | 45.30 | 0.00 | 0.00 | true | 0.452920;0.447724;0.451733;0.452964;0.456920 | 0;0;0;0;0 | 1034944;1043392;1044544;1045728;1047200 | 1204416;1204416;1204416;1204416;1204416 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 8.00 | 1271872 | 6730.67 | 157578.67 | 2.30 | 7.74 | 158.98 | true | 0.023029;0.023028;0.023024;0.023026;0.023049 | 1271872;1271872;1271872;1271872;1271872 | 157728;159200;155808;153280;175808 | 6560;6400;7040;6592;7040 |
175 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223296.00 | 16960.00 | 45.80 | 0.23 | 13.87 | true | 0.457848;0.458733;0.457734;0.458054;0.456798 | 55488;55488;55488;55488;55488 | 223296;223296;224832;223296;223296 | 17280;17440;17664;16032;16160 |
176 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 20 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 71146.67 | 45.30 | 0.89 | 16.18 | true | 0.454148;0.452953;0.452485;0.453479;0.452087 | 64736;64736;64736;64736;64736 | 71328;70848;71168;73792;70944 | 1216;1216;1216;1216;1216 |
177 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 20.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 768.00 | 597.33 | 43.90 | 0.00 | 0.00 | true | 0.439477;0.438888;0.439710;0.439109;0.439469 | 0;0;0;0;0 | 768;1792;768;768;768 | 512;384;640;640;640 |
178 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 17.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 1365.33 | 43.80 | 0.00 | 0.00 | true | 0.438000;0.437401;0.437625;0.436920;0.437484 | 0;0;0;0;0 | 2688;1664;512;1280;1152 | 0;0;0;0;0 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.00 | 285752320 | 1758880.00 | 1868010.67 | 10.00 | 78.79 | 8404.48 | false | 0.099599;0.099844;0.099984;0.100062;0.099662 | 285752320;285752320;285752320;285752320;285752320 | 1651872;1703968;1811296;1789152;1783520 | 1915040;1904352;1842016;1843936;1855744 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 3904.00 | 4939552.00 | 18.50 | 3.10 | 956.93 | true | 0.182032;0.187363;0.185586;0.184830;0.185489 | 15310848;15310848;15310848;15310848;15310848 | 4879872;4897152;4991328;4967168;4954336 | 3968;3648;4672;3712;4032 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 1161984 | 223050.67 | 624277.33 | 2.00 | 1.37 | 72.62 | true | 0.019948;0.019949;0.019947;0.019934;0.019958 | 1161984;1161984;1161984;1161984;1161984 | 224416;222880;222880;223136;223136 | 663040;652832;589600;611072;608928 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204394.67 | 774517.33 | 44.70 | 0.00 | 0.00 | true | 0.444224;0.447440;0.448358;0.445421;0.449770 | 0;0;0;0;0 | 1204480;1204224;1204480;1204224;1204480 | 768608;766912;777504;777440;789472 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.33 | 1271872 | 2752.00 | 80714.67 | 2.30 | 15.24 | 293.53 | true | 0.022993;0.022992;0.022993;0.022990;0.022992 | 1271872;1271872;1271872;1271872;1271872 | 1312;3680;2976;1696;3584 | 99040;78816;80288;81216;80640 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.00 | 380608512 | 7073397.33 | 2589802.67 | 11.00 | 39.39 | 9515.21 | false | 0.110065;0.109946;0.110115;0.110245;0.109979 | 380608512;380608512;380608512;380608512;380608512 | 7031776;7150368;7070368;7094944;7054880 | 2606816;2630048;2562848;2586176;2576416 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 27.00 | 20414464 | 89226.67 | 6969941.33 | 19.50 | 2.89 | 756.09 | true | 0.194292;0.195908;0.196084;0.194126;0.191716 | 20414464;20414464;20414464;20414464;20414464 | 86752;92768;90400;81056;90528 | 6975968;6931136;6974496;6963168;6972160 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605824.00 | 942858.67 | 44.90 | 0.00 | 0.00 | true | 0.447735;0.449647;0.450707;0.449018;0.445344 | 0;0;0;0;0 | 899232;924000;971936;946112;958464 | 1605824;1605824;1605824;1605824;1605824 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 734581.33 | 2.30 | 1.36 | 123.24 | true | 0.023192;0.023177;0.023178;0.023168;0.023196 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 728448;752288;725728;747456;727840 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 1453568 | 3488.00 | 38485.33 | 2.60 | 34.63 | 207.65 | false | 0.026195;0.026178;0.026185;0.026196;0.026187 | 1453568;1453568;1453568;1453568;1453568 | 3488;3488;3488;3488;3488 | 42752;40576;37888;34560;36992 |
181 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 26.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 260160.00 | 36181.33 | 45.90 | 0.22 | 16.18 | true | 0.459012;0.459246;0.457250;0.459550;0.457414 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;260160 | 33408;36864;35584;36864;36096 |
182 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 20.333 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1344.00 | 512.00 | 44.90 | 39.86 | 18.50 | false | 0.448972;0.449448;0.448319;0.450160;0.448705 | 73984;73984;73984;73984;73984 | 1344;1344;1344;1344;1344 | 128;768;128;768;640 |
183 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 18.333 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 768.00 | 768.00 | 44.00 | 0.00 | 0.00 | true | 0.439403;0.439983;0.439823;0.440232;0.439646 | 0;0;0;0;0 | 768;768;768;768;768 | 1920;768;768;768;768 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.00 | 333032448 | 5086784.00 | 2248778.67 | 9.90 | 45.40 | 8325.81 | false | 0.099110;0.098853;0.099274;0.099307;0.099025 | 333032448;333032448;333032448;333032448;333032448 | 5051712;5167232;5052416;5156224;4959616 | 2335712;2268480;2194912;2210976;2266880 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.00 | 17862656 | 28416.00 | 5941674.67 | 20.30 | 2.99 | 1050.74 | true | 0.202669;0.203597;0.203288;0.202864;0.205538 | 17862656;17862656;17862656;17862656;17862656 | 5869088;5919968;5980704;5995264;5924352 | 29696;28096;27456;30016;25536 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.67 | 1355648 | 259360.00 | 741024.00 | 2.30 | 1.36 | 116.20 | true | 0.023153;0.023181;0.023180;0.023185;0.023166 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;261408;259360;259360 | 776320;748256;724416;705888;750400 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1404928.00 | 499296.00 | 44.70 | 0.00 | 0.00 | true | 0.444016;0.447203;0.447020;0.447593;0.450386 | 0;0;0;0;0 | 1404928;1404928;1404928;1404928;1404928 | 477536;467872;511008;509344;521760 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 629.33 | 44085.33 | 2.30 | 28.44 | 317.97 | false | 0.022951;0.022952;0.022948;0.022948;0.022950 | 1271872;1271872;1271872;1271872;1271872 | 43520;46208;43136;43776;44960 | 672;544;928;672;544 |
185 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 25 | 443648 | 0 | 171938048 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 554.67 | 45.70 | 36.56 | 16.18 | false | 0.456697;0.455636;0.460262;0.457610;0.457257 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;5824 | 128;1152;128;384;2176 |
186 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 19.333 | 443648 | 0 | 171938048 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 181.33 | 469.33 | 44.90 | 0.00 | 0.00 | true | 0.449148;0.449210;0.449466;0.448856;0.448739 | 0;0;0;0;0 | 96;352;96;96;1120 | 0;0;256;2304;1152 |
187 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 181 | 295936 | 1901568 | 172233984 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.00 | 256975104 | 5141.33 | 249920.00 | 3.10 | 1007.50 | 2953.74 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 5056;5312;5056;5312;5056 | 248576;253056;250336;250848;241120 |
187 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 181 | 295936 | 1901568 | 172233984 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605632.00 | 623296.00 | 44.60 | 0.00 | 0.00 | true | 0.444297;0.447319;0.448230;0.447130;0.444614 | 0;0;0;0;0 | 1605632;1605632;1605632;1605632;1605632 | 588288;608480;648256;636320;625088 |
188 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 25 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 73984 | 1280.00 | 40053.33 | 44.50 | 1.79 | 22.20 | true | 0.445248;0.444613;0.444333;0.445732;0.443790 | 73984;73984;73984;73984;73984 | 1280;1280;1280;1280;1280 | 40864;36256;41152;38144;42656 |
190 | InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 22.333 | 1479936 | 0 | 172086528 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 255370.67 | 61.60 | 0.00 | 0.00 | true | 0.614747;0.615665;0.615470;0.615477;0.616250 | 0;0;0;0;0 | 256;1280;256;256;256 | 257408;256256;255232;254624;254208 |
191 | InceptionV4/InceptionV4/Mixed_6d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 47.333 | 1183744 | 1183744 | 173270272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 15.33 | 5981137 | 3840.00 | 887904.00 | 52.60 | 6.71 | 390.08 | true | 0.527700;0.524908;0.524827;0.525669;0.526525 | 5981137;5981137;5981137;5981137;5981137 | 3840;3584;4096;3584;4352 | 887904;887904;887904;887808;888064 |
192 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 146.667 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 437877.33 | 3.10 | 287.49 | 2170.42 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 439360;441696;435680;438592;430816 |
192 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 146.667 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786666.67 | 545450.67 | 43.80 | 0.00 | 0.00 | true | 0.443609;0.434072;0.431446;0.436494;0.444636 | 0;0;0;0;0 | 543040;541664;548128;545184;552640 | 786752;786496;786752;786496;786752 |
193 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 106304.00 | 3.10 | 1184.19 | 2170.42 | false | 0.031246;0.031246;0.031246;0.031247;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;1792;0;0;0 | 115584;94816;106752;108000;104160 |
193 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 114325.33 | 43.70 | 0.00 | 0.00 | true | 0.439444;0.432712;0.441866;0.434273;0.438713 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 105728;122528;114048;111744;117184 |
194 | InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 178.333 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.00 | 251769216 | 0.00 | 402549.33 | 4.80 | 625.44 | 2927.55 | false | 0.047753;0.047781;0.047912;0.047702;0.047580 | 251769216;251769216;251769216;251769216;251769216 | 0;0;0;0;0 | 421536;418016;397760;376000;391872 |
194 | InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 178.333 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 1167552.00 | 44.90 | 0.00 | 0.00 | true | 0.448869;0.444720;0.457047;0.444551;0.452204 | 0;0;0;0;0 | 1149056;1152448;1172320;1191712;1177888 | 1572864;1572864;1572864;1572864;1572864 |
195 | InceptionV4/InceptionV4/Mixed_6d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.667 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 73.33 | 83923072 | 785578.67 | 284309.33 | 3.10 | 78.44 | 1144.41 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 791808;784128;787200;784256;785280 | 285280;284096;279104;284736;284096 |
195 | InceptionV4/InceptionV4/Mixed_6d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.667 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524288.00 | 183722.67 | 43.00 | 0.00 | 0.00 | true | 0.427534;0.428351;0.432785;0.430407;0.430339 | 0;0;0;0;0 | 525056;524288;524288;524288;524288 | 182048;185920;184544;183328;183296 |
196 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 25.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223530.67 | 14517.33 | 45.70 | 0.23 | 13.87 | true | 0.456998;0.457284;0.456928;0.460641;0.455417 | 55488;55488;55488;55488;55488 | 229632;221952;221952;221952;226688 | 23808;10656;9728;10752;22144 |
197 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 20.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 180693.33 | 40277.33 | 44.50 | 0.25 | 13.87 | true | 0.446268;0.446398;0.443701;0.445613;0.444523 | 55488;55488;55488;55488;55488 | 183680;179200;179072;179200;183680 | 36608;41728;43776;41984;37120 |
198 | InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 19.333 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 349706.67 | 51.80 | 0.32 | 27.74 | true | 0.519275;0.517810;0.517825;0.517830;0.519825 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1536 | 348736;353568;346816;355616;344000 |
199 | InceptionV4/InceptionV4/Mixed_6d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 19.333 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 36992 | 512.00 | 11648.00 | 43.50 | 3.04 | 12.33 | true | 0.435128;0.435619;0.433654;0.435249;0.433468 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 11008;16896;11648;12032;11264 |
200 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 18.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 40234.67 | 43.90 | 0.00 | 0.00 | true | 0.439821;0.439478;0.438415;0.439253;0.439386 | 0;0;0;0;0 | 0;0;0;0;0 | 37376;42112;35456;41728;41600 |
201 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 21.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 1493.33 | 43.70 | 0.00 | 0.00 | true | 0.437235;0.436792;0.437383;0.436561;0.436633 | 0;0;0;0;0 | 0;0;0;0;0 | 1792;1280;2560;640;1408 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 33.00 | 244930560 | 14880.00 | 1030325.33 | 8.70 | 234.34 | 7422.14 | false | 0.086810;0.086822;0.086753;0.086812;0.086622 | 244930560;244930560;244930560;244930560;244930560 | 14880;17184;15392;14368;14368 | 1059200;1069376;985120;1010944;1020832 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 1898.67 | 4033408.00 | 17.00 | 3.25 | 937.40 | true | 0.169584;0.168002;0.169765;0.171185;0.170156 | 13123584;13123584;13123584;13123584;13123584 | 1856;1856;2368;1984;1856 | 3961952;3948960;4109696;4075040;4063232 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222368.00 | 715253.33 | 2.00 | 1.24 | 105.63 | true | 0.019962;0.019985;0.019978;0.019978;0.019999 | 1161984;1161984;1161984;1161984;1161984 | 222368;222240;222368;222368;222368 | 750880;750464;677504;694624;700672 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 351744.00 | 44.30 | 0.00 | 0.00 | true | 0.444187;0.443173;0.443642;0.442963;0.442615 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 341952;368192;350752;353216;351264 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 1090176 | 1152.00 | 261034.67 | 2.00 | 4.16 | 233.59 | true | 0.019936;0.019932;0.019908;0.019878;0.019929 | 1090176;1090176;1090176;1090176;1090176 | 0;2048;256;5632;1152 | 260288;261920;260896;262880;258368 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 200.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.67 | 285752320 | 2210144.00 | 1767797.33 | 10.00 | 71.83 | 8242.78 | false | 0.099728;0.099781;0.099799;0.100088;0.099642 | 285752320;285752320;285752320;285752320;285752320 | 2125216;2233120;2207584;2253472;2189728 | 1786816;1782880;1759584;1712800;1760928 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 200.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 4170.67 | 4967477.33 | 18.60 | 3.08 | 956.93 | true | 0.185466;0.185313;0.186528;0.187900;0.186168 | 15310848;15310848;15310848;15310848;15310848 | 3488;3232;5536;4896;4128 | 4936000;4922784;4983584;5006528;4982848 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 200.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 1161984 | 228682.67 | 586080.00 | 2.00 | 1.43 | 72.62 | true | 0.019930;0.019936;0.019933;0.019971;0.019961 | 1161984;1161984;1161984;1161984;1161984 | 228512;228512;228768;231840;228768 | 608928;606496;564224;581120;570624 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 200.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204416.00 | 1051338.67 | 45.10 | 0.00 | 0.00 | true | 0.449295;0.449000;0.455463;0.450737;0.452038 | 0;0;0;0;0 | 1204416;1204416;1204416;1204416;1204416 | 998176;1042048;1058816;1055648;1056320 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 200.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 8.00 | 1271872 | 7541.33 | 111936.00 | 2.30 | 10.65 | 158.98 | true | 0.023070;0.023033;0.023034;0.023036;0.023036 | 1271872;1271872;1271872;1271872;1271872 | 6432;7456;7456;7712;7712 | 115168;107936;112960;105600;114912 |
204 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26.667 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223296.00 | 44554.67 | 45.90 | 0.21 | 13.87 | true | 0.459577;0.459569;0.459306;0.457309;0.458934 | 55488;55488;55488;55488;55488 | 223296;223296;223296;223296;223296 | 44928;42752;45216;44448;44288 |
205 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 20 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 554.67 | 45.30 | 36.56 | 16.18 | false | 0.459677;0.451696;0.453461;0.453247;0.453639 | 64736;64736;64736;64736;64736 | 256;896;0;512;1280 | 1216;1216;1216;1216;1216 |
206 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 19.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 1024.00 | 34410.67 | 44.00 | 0.00 | 0.00 | true | 0.439972;0.439856;0.439843;0.439926;0.439533 | 0;0;0;0;0 | 1024;1024;1024;6400;1024 | 36256;34848;32128;37120;31776 |
207 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 18.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 597.33 | 43.80 | 0.00 | 0.00 | true | 0.438087;0.436940;0.438178;0.436958;0.437550 | 0;0;0;0;0 | 0;0;0;0;0 | 1664;384;640;768;0 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.33 | 285752320 | 1822122.67 | 1895242.67 | 10.00 | 76.87 | 8322.96 | false | 0.099833;0.099746;0.099852;0.099768;0.100045 | 285752320;285752320;285752320;285752320;285752320 | 1823488;1785280;1770944;1865920;1857600 | 1886752;1913376;1964320;1885600;1827360 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.67 | 1161984 | 222709.33 | 594240.00 | 2.00 | 1.42 | 69.72 | true | 0.019967;0.019974;0.019994;0.019981;0.020004 | 1161984;1161984;1161984;1161984;1161984 | 222624;222624;222624;222880;222880 | 610336;626592;597600;574784;549184 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 7978.67 | 4982464.00 | 18.30 | 3.07 | 956.93 | true | 0.183788;0.186178;0.182145;0.180213;0.181841 | 15310848;15310848;15310848;15310848;15310848 | 5952;6080;9216;8640;15936 | 4960512;4947552;4971296;5015584;5063808 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204394.67 | 834741.33 | 44.40 | 0.00 | 0.00 | true | 0.442317;0.444683;0.444604;0.443924;0.444010 | 0;0;0;0;0 | 1204480;1204224;1204480;1204224;1204736 | 828096;835264;833344;856960;835616 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 2869.33 | 96181.33 | 2.30 | 12.84 | 317.97 | true | 0.022996;0.022972;0.022980;0.022981;0.022981 | 1271872;1271872;1271872;1271872;1271872 | 2048;2816;3200;2592;3968 | 97248;100544;91200;91936;99360 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 41.00 | 380608512 | 7058165.33 | 2662112.00 | 11.00 | 39.16 | 9283.13 | false | 0.110274;0.109878;0.110045;0.110467;0.110542 | 380608512;380608512;380608512;380608512;380608512 | 2677312;2678848;2638816;2652224;2656800 | 7048864;6992416;7090848;7034784;7091232 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 27.00 | 20414464 | 97930.67 | 7168938.67 | 19.50 | 2.81 | 756.09 | true | 0.194927;0.196755;0.192372;0.195624;0.194774 | 20414464;20414464;20414464;20414464;20414464 | 7149888;7104800;7175840;7181088;7190656 | 95072;93472;102048;96672;103968 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605824.00 | 1019861.33 | 45.10 | 0.00 | 0.00 | true | 0.450860;0.450419;0.450344;0.450661;0.450830 | 0;0;0;0;0 | 1017472;1016576;991776;1025536;1051584 | 1605824;1605824;1605824;1605824;1606848 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 641866.67 | 2.30 | 1.50 | 123.24 | true | 0.023194;0.023182;0.023187;0.023162;0.023172 | 1355648;1355648;1355648;1355648;1355648 | 648864;673568;642560;634176;614272 | 259360;259360;259360;259360;259360 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 1453568 | 3232.00 | 49546.67 | 2.60 | 27.54 | 207.65 | true | 0.026190;0.026201;0.026186;0.026200;0.026209 | 1453568;1453568;1453568;1453568;1453568 | 42016;41472;61312;55424;51200 | 3232;3232;3232;3232;3232 |
210 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 26 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 260160.00 | 20053.33 | 45.90 | 0.23 | 16.18 | true | 0.459532;0.460658;0.459176;0.459290;0.459553 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;260160 | 19840;20736;20096;20224;19712 |
211 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 20 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1344.00 | 522.67 | 44.90 | 39.63 | 18.50 | false | 0.448869;0.449168;0.448576;0.453263;0.447785 | 73984;73984;73984;73984;73984 | 256;1024;256;544;768 | 1344;1344;1600;1344;1344 |
212 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 18.333 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 768.00 | 85.33 | 44.00 | 0.00 | 0.00 | true | 0.440038;0.439337;0.440244;0.439601;0.439590 | 0;0;0;0;0 | 768;768;768;768;768 | 256;1664;0;0;0 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.00 | 333032448 | 5167744.00 | 2275669.33 | 9.90 | 44.74 | 8325.81 | false | 0.098961;0.099139;0.098922;0.099131;0.098801 | 333032448;333032448;333032448;333032448;333032448 | 2260416;2293280;2273312;2234016;2301088 | 5178752;5176960;5147520;5190016;5123456 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.00 | 17862656 | 23402.67 | 5913386.67 | 20.50 | 3.01 | 1050.74 | true | 0.202485;0.204187;0.205101;0.205955;0.204619 | 17862656;17862656;17862656;17862656;17862656 | 24640;27200;22208;23360;20672 | 5923840;5880096;5936224;5974496;5853280 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1404928.00 | 491744.00 | 44.60 | 0.00 | 0.00 | true | 0.447189;0.443733;0.450224;0.446844;0.444858 | 0;0;0;0;0 | 1404928;1404928;1404928;1404928;1404928 | 480160;486336;490688;498208;510784 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 729888.00 | 2.30 | 1.37 | 123.24 | true | 0.023178;0.023180;0.023170;0.023165;0.023169 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 740288;750560;698816;673984;759104 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 629.33 | 16618.67 | 2.30 | 73.74 | 317.97 | false | 0.022952;0.022952;0.022980;0.022948;0.022978 | 1271872;1271872;1271872;1271872;1271872 | 672;800;544;672;544 | 15616;17600;16672;16896;16288 |
214 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 25.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 181.33 | 45.60 | 46.33 | 16.18 | false | 0.457108;0.455537;0.456808;0.455932;0.456037 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1472;1216 | 544;128;128;128;288 |
215 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 21.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 949.33 | 512.00 | 44.90 | 0.00 | 0.00 | true | 0.449410;0.448555;0.454439;0.449424;0.448559 | 0;0;0;0;0 | 96;96;96;7520;2656 | 512;128;256;768;2048 |
216 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 177 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.00 | 256975104 | 5397.33 | 144469.33 | 3.10 | 1714.69 | 2953.74 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 5312;5568;5312;5568;5312 | 155840;143904;145504;144000;142112 |
216 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 177 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605632.00 | 901194.67 | 44.90 | 0.00 | 0.00 | true | 0.447465;0.449219;0.456947;0.446919;0.448990 | 0;0;0;0;0 | 1605632;1605632;1605632;1605632;1605632 | 893440;899264;910752;916384;893568 |
217 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 24.667 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 73984 | 1280.00 | 12853.33 | 44.40 | 5.23 | 22.20 | true | 0.444116;0.443808;0.443506;0.443877;0.444800 | 73984;73984;73984;73984;73984 | 2816;1280;1280;1280;1280 | 13440;11520;12320;13184;13056 |
219 | InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 21.667 | 1627648 | 0 | 172234240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 41344.00 | 61.60 | 0.00 | 0.00 | true | 0.614599;0.614992;0.615116;0.616660;0.617720 | 0;0;0;0;0 | 256;256;256;256;256 | 41600;47232;40960;39936;41472 |
220 | InceptionV4/InceptionV4/Mixed_6e/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 49 | 1479936 | 1479936 | 173714176 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 15.00 | 6232897 | 3861.33 | 680608.00 | 52.90 | 9.11 | 415.53 | true | 0.527022;0.529507;0.528120;0.528819;0.531435 | 6232897;6232897;6232897;6232897;6232897 | 3840;3648;4096;3584;4096 | 682368;683712;679520;679712;679744 |
221 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 144 | 221952 | 1008384 | 173936128 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 434901.33 | 3.10 | 289.46 | 2170.42 | false | 0.031246;0.031246;0.031247;0.031246;0.031247 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 430592;443264;435584;438528;427520 |
221 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 144 | 221952 | 1008384 | 173936128 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786645.33 | 575072.00 | 44.10 | 0.00 | 0.00 | true | 0.429310;0.446525;0.447173;0.415763;0.447779 | 0;0;0;0;0 | 786688;786432;786816;786432;791808 | 579680;566880;574592;570944;582816 |
222 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 144 | 221952 | 1008384 | 174158080 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 92053.33 | 3.10 | 1367.52 | 2170.42 | false | 0.031246;0.031247;0.031247;0.031247;0.031247 | 125884608;125884608;125884608;125884608;125884608 | 94400;91968;92224;88768;91968 | 0;0;0;0;0 |
222 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 144 | 221952 | 1008384 | 174158080 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 128970.67 | 43.90 | 0.00 | 0.00 | true | 0.435549;0.440744;0.436970;0.439184;0.439518 | 0;0;0;0;0 | 786688;786432;786432;786432;786432 | 126464;129056;128800;132512;129056 |
223 | InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 173.333 | 443904 | 2016768 | 174601984 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.33 | 251769216 | 15189.33 | 507157.33 | 4.80 | 482.00 | 2916.26 | false | 0.047318;0.047682;0.047526;0.047303;0.047681 | 251769216;251769216;251769216;251769216;251769216 | 509344;516192;505728;506400;499808 | 12288;15296;13696;16576;17280 |
223 | InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 173.333 | 443904 | 2016768 | 174601984 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572928.00 | 1184533.33 | 45.20 | 0.00 | 0.00 | true | 0.452262;0.454181;0.452938;0.445243;0.450505 | 0;0;0;0;0 | 1572928;1572928;1572928;1572928;1572864 | 1182752;1172448;1183552;1187296;1194080 |
224 | InceptionV4/InceptionV4/Mixed_6e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 159.333 | 147968 | 672256 | 173122304 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 73.67 | 83923072 | 777258.67 | 192117.33 | 3.10 | 86.57 | 1139.22 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 779520;779648;774784;774272;777472 | 190496;202816;190528;187072;195328 |
224 | InceptionV4/InceptionV4/Mixed_6e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 159.333 | 147968 | 672256 | 173122304 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524309.33 | 215093.33 | 43.10 | 0.00 | 0.00 | true | 0.435406;0.423737;0.435141;0.420367;0.434028 | 0;0;0;0;0 | 524288;524288;524352;524288;524480 | 216032;207552;216576;221312;212672 |
225 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 222336.00 | 682.67 | 45.70 | 0.25 | 13.87 | true | 0.458850;0.455151;0.455690;0.455635;0.466867 | 55488;55488;55488;55488;55488 | 222336;221952;222848;221824;222720 | 768;512;1024;512;768 |
226 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 19.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 222720.00 | 95573.33 | 44.70 | 0.17 | 13.87 | true | 0.448096;0.447951;0.445988;0.446309;0.445526 | 55488;55488;55488;55488;55488 | 95744;95872;95488;95360;95488 | 222720;222720;222720;222720;222720 |
227 | InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 22 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 255562.67 | 51.90 | 0.43 | 27.74 | true | 0.518895;0.520233;0.519062;0.518056;0.521215 | 110976;110976;110976;110976;110976 | 1536;1536;3072;1536;1536 | 256416;256160;229536;254112;257056 |
228 | InceptionV4/InceptionV4/Mixed_6e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 18 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 36992 | 512.00 | 7381.33 | 43.60 | 4.69 | 12.33 | true | 0.439183;0.435674;0.435908;0.433717;0.435296 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 7168;5760;6656;9856;8320 |
229 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 18.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 94336.00 | 44.00 | 0.00 | 0.00 | true | 0.439251;0.439239;0.439903;0.439685;0.439665 | 0;0;0;0;0 | 0;0;0;0;0 | 94080;94464;94208;94336;94592 |
230 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 18.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 256.00 | 43.70 | 0.00 | 0.00 | true | 0.436901;0.437155;0.436637;0.436737;0.437350 | 0;0;0;0;0 | 0;0;0;0;0 | 256;256;256;256;256 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 33.00 | 244930560 | 9141.33 | 994528.00 | 8.70 | 244.04 | 7422.14 | false | 0.086735;0.086883;0.086751;0.086778;0.086583 | 244930560;244930560;244930560;244930560;244930560 | 9184;9568;8672;9952;7008 | 1048640;1015968;965280;949760;1002336 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 1749.33 | 4044117.33 | 17.00 | 3.24 | 937.40 | true | 0.170733;0.167178;0.170232;0.170620;0.169147 | 13123584;13123584;13123584;13123584;13123584 | 1664;1536;1664;1920;2048 | 3941760;3982784;4125664;4138688;4023904 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222325.33 | 666101.33 | 2.00 | 1.31 | 105.63 | true | 0.019971;0.019982;0.019964;0.019963;0.019945 | 1161984;1161984;1161984;1161984;1161984 | 222368;222240;222112;222368;222368 | 710080;698048;621984;625920;674336 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 340693.33 | 44.30 | 0.00 | 0.00 | true | 0.442376;0.443341;0.444556;0.441229;0.442073 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 340864;336800;333952;347616;344416 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 5674.67 | 276021.33 | 2.00 | 3.87 | 218.04 | true | 0.019879;0.019933;0.019902;0.019861;0.019934 | 1090176;1090176;1090176;1090176;1090176 | 5184;6336;5952;5824;5248 | 275552;276416;275008;276096;278560 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.33 | 285752320 | 1976522.67 | 1823957.33 | 10.00 | 75.19 | 8322.96 | false | 0.099992;0.099883;0.099918;0.099718;0.099700 | 285752320;285752320;285752320;285752320;285752320 | 1925920;1926944;1992352;2050592;2010272 | 1846144;1830368;1799904;1802336;1839168 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 5280.00 | 4949280.00 | 18.90 | 3.09 | 956.93 | true | 0.187789;0.189071;0.189207;0.187547;0.189294 | 15310848;15310848;15310848;15310848;15310848 | 5664;4128;5408;4768;9632 | 4857088;4873152;4999136;5007488;4975552 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 1161984 | 228426.67 | 620234.67 | 2.00 | 1.37 | 72.62 | true | 0.019931;0.019932;0.019928;0.019952;0.019934 | 1161984;1161984;1161984;1161984;1161984 | 228256;228256;228512;229536;228512 | 647232;657632;604672;601376;608800 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204416.00 | 1071776.00 | 44.90 | 0.00 | 0.00 | true | 0.449649;0.451555;0.448077;0.449904;0.448925 | 0;0;0;0;0 | 1204416;1204416;1204416;1204416;1204416 | 1059200;1066464;1072064;1076800;1078816 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 8.00 | 1271872 | 12800.00 | 68490.67 | 2.30 | 15.65 | 158.98 | true | 0.023044;0.023045;0.023038;0.023026;0.023039 | 1271872;1271872;1271872;1271872;1271872 | 12672;12032;12320;13408;17152 | 66720;70944;68352;68512;68608 |
233 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223296.00 | 17568.00 | 45.80 | 0.23 | 13.87 | true | 0.457600;0.458976;0.457220;0.458588;0.456467 | 55488;55488;55488;55488;55488 | 223296;223296;223296;223296;223296 | 17312;18688;16256;17696;17696 |
234 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 22.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 23690.67 | 45.40 | 2.60 | 16.18 | true | 0.454668;0.453833;0.453568;0.453563;0.452509 | 64736;64736;64736;64736;64736 | 22272;26112;22304;24448;24320 | 1216;1216;1216;1216;1216 |
235 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 18 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 1024.00 | 3178.67 | 44.00 | 0.00 | 0.00 | true | 0.439587;0.439522;0.439295;0.439512;0.439837 | 0;0;0;0;0 | 1024;1024;1024;1024;1024 | 3968;2304;2208;3264;4192 |
236 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 21.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 725.33 | 43.70 | 0.00 | 0.00 | true | 0.437257;0.437384;0.437290;0.437500;0.437663 | 0;0;0;0;0 | 0;0;0;0;0 | 512;1024;1024;512;640 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.00 | 285752320 | 1533482.67 | 1843029.33 | 10.00 | 84.63 | 8404.48 | false | 0.099700;0.099740;0.100022;0.099992;0.099939 | 285752320;285752320;285752320;285752320;285752320 | 1576096;1518464;1511424;1560064;1521920 | 1840320;1852384;1849888;1838880;1821856 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 4544.00 | 4990826.67 | 18.40 | 3.07 | 956.93 | true | 0.187051;0.182107;0.184115;0.181928;0.186330 | 15310848;15310848;15310848;15310848;15310848 | 4928;4544;4544;4544;3776 | 4977664;4976992;5017824;5023424;4967680 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 1161984 | 222965.33 | 643680.00 | 2.00 | 1.34 | 72.62 | true | 0.019962;0.019948;0.019962;0.019957;0.019980 | 1161984;1161984;1161984;1161984;1161984 | 222880;222880;222880;223136;223136 | 655616;668864;630720;630592;644704 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204480.00 | 835370.67 | 44.70 | 0.00 | 0.00 | true | 0.445541;0.448992;0.451862;0.446507;0.445370 | 0;0;0;0;0 | 1206528;1204224;1204480;1204224;1204736 | 833184;831360;841568;822720;844128 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 24085.33 | 74784.00 | 2.30 | 12.86 | 254.37 | true | 0.022981;0.022988;0.022985;0.022991;0.022983 | 1271872;1271872;1271872;1271872;1271872 | 19776;21664;24384;26208;28352 | 75968;74080;74304;73920;82720 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 210 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 41.00 | 380608512 | 6973301.33 | 2660928.00 | 11.00 | 39.51 | 9283.13 | false | 0.110149;0.110410;0.110215;0.110323;0.110264 | 380608512;380608512;380608512;380608512;380608512 | 6971680;7002528;6968864;6979360;6936480 | 2666816;2699840;2627616;2653248;2662720 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 210 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 26.67 | 20414464 | 69173.33 | 6982709.33 | 19.70 | 2.89 | 765.53 | true | 0.197624;0.199520;0.194553;0.196200;0.195966 | 20414464;20414464;20414464;20414464;20414464 | 68960;70496;72416;68064;66016 | 6960448;6915616;7026656;7015808;6971872 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 210 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605824.00 | 906474.67 | 45.00 | 0.00 | 0.00 | true | 0.455274;0.448366;0.450173;0.450460;0.449972 | 0;0;0;0;0 | 1605824;1605824;1605824;1605824;1605824 | 913440;904576;901408;900768;918400 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 210 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 724416.00 | 2.30 | 1.38 | 123.24 | true | 0.023172;0.023175;0.023192;0.023182;0.023176 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;262432;259360;259360 | 734400;750912;670176;716352;722496 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 210 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 1453568 | 3488.00 | 67200.00 | 2.60 | 20.56 | 207.65 | true | 0.026206;0.026184;0.026191;0.026187;0.026190 | 1453568;1453568;1453568;1453568;1453568 | 3488;3488;3488;3488;3488 | 62336;61824;73344;70912;68352 |
239 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 26 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 260160.00 | 32042.67 | 45.90 | 0.22 | 16.18 | true | 0.457663;0.461118;0.458483;0.459144;0.458541 | 64736;64736;64736;64736;64736 | 31616;33664;32640;31872;31104 | 260160;260160;260160;260160;260160 |
240 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 19.667 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1344.00 | 2133.33 | 45.00 | 21.28 | 18.50 | true | 0.450667;0.448395;0.454030;0.449443;0.448610 | 73984;73984;73984;73984;73984 | 1344;1344;1344;6720;1344 | 2048;1536;2304;2048;2816 |
241 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 18.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 768.00 | 469.33 | 44.00 | 0.00 | 0.00 | true | 0.439632;0.439918;0.439317;0.440114;0.439158 | 0;0;0;0;0 | 768;768;768;768;768 | 512;896;1280;0;0 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.00 | 333032448 | 5138005.33 | 2244405.33 | 9.90 | 45.11 | 8325.81 | false | 0.099108;0.098839;0.098930;0.098873;0.099342 | 333032448;333032448;333032448;333032448;333032448 | 5150080;5141504;5122432;4900096;5250176 | 2268512;2302656;2219328;2245376;2218112 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.00 | 17862656 | 27968.00 | 5939808.00 | 20.30 | 2.99 | 1050.74 | true | 0.201925;0.200085;0.203370;0.203520;0.204291 | 17862656;17862656;17862656;17862656;17862656 | 27328;25152;27968;28608;30528 | 5906304;5853568;5968256;5958560;5954560 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1404928.00 | 398602.67 | 44.40 | 0.00 | 0.00 | true | 0.445780;0.444849;0.442543;0.440904;0.448651 | 0;0;0;0;0 | 1404928;1404928;1404928;1406464;1404928 | 414592;373760;405504;390016;400288 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 779829.33 | 2.30 | 1.30 | 123.24 | true | 0.023197;0.023192;0.023174;0.023169;0.023167 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;261152 | 802880;824224;763296;763072;773312 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 416.00 | 29077.33 | 2.30 | 43.12 | 317.97 | false | 0.022936;0.022952;0.022953;0.022954;0.022953 | 1271872;1271872;1271872;1271872;1271872 | 416;416;256;416;416 | 27008;30368;29856;30592;26240 |
243 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 26 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1301.33 | 0.00 | 45.70 | 49.75 | 16.18 | false | 0.463650;0.456675;0.457066;0.456643;0.457560 | 64736;64736;64736;64736;64736 | 1472;1216;1216;1472;1216 | 0;0;256;0;0 |
244 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 19.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 96.00 | 0.00 | 44.90 | 0.00 | 0.00 | true | 0.448071;0.448757;0.448699;0.449365;0.448343 | 0;0;0;0;0 | 96;96;96;96;96 | 0;0;0;0;0 |
245 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 180.667 | 295936 | 1901568 | 172271360 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.00 | 256975104 | 5141.33 | 234464.00 | 3.10 | 1072.49 | 2953.74 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 5056;8640;5056;5312;5056 | 236928;233920;231424;233536;235936 |
245 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 180.667 | 295936 | 1901568 | 172271360 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605632.00 | 617706.67 | 45.00 | 0.00 | 0.00 | true | 0.448464;0.452103;0.446589;0.454401;0.449561 | 0;0;0;0;0 | 1605632;1605632;1605632;1605632;1605632 | 606016;608032;627776;617312;629952 |
246 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 25 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 73984 | 1280.00 | 34058.67 | 44.40 | 2.09 | 22.20 | true | 0.443171;0.443331;0.444119;0.444465;0.443716 | 73984;73984;73984;73984;73984 | 33312;31360;39296;35712;33152 | 1280;1280;1280;1280;1280 |
248 | InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 22.333 | 1479936 | 0 | 172086528 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 132512.00 | 61.60 | 0.00 | 0.00 | true | 0.616089;0.617270;0.616105;0.614920;0.613729 | 0;0;0;0;0 | 256;256;256;256;256 | 132256;132512;132640;132384;132896 |
249 | InceptionV4/InceptionV4/Mixed_6f/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 46 | 1183744 | 1183744 | 173270272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 15.00 | 5969482 | 3840.00 | 1050784.00 | 52.50 | 5.66 | 397.97 | true | 0.525128;0.523203;0.525157;0.526820;0.525363 | 5969482;5969482;5969482;5969482;5969482 | 3840;3584;4096;3584;9216 | 1046592;1053216;1050208;1053568;1048928 |
250 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 148 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 355221.33 | 3.10 | 354.38 | 2170.42 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 361152;355776;355328;352928;354560 |
250 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 148 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786666.67 | 548704.00 | 43.40 | 0.00 | 0.00 | true | 0.426341;0.433225;0.435911;0.438673;0.432129 | 0;0;0;0;0 | 786752;786496;786752;786496;786752 | 543008;548352;547904;552032;549856 |
251 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 139.667 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 121749.33 | 3.10 | 1033.97 | 2170.42 | false | 0.031246;0.031245;0.031246;0.031245;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 123072;121408;119648;122176;121664 | 0;0;0;0;0 |
251 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 139.667 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 100736.00 | 43.90 | 0.00 | 0.00 | true | 0.437294;0.441638;0.441352;0.438507;0.436690 | 0;0;0;0;0 | 100000;101600;95552;100608;102592 | 791552;786432;786432;786432;786432 |
252 | InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 175 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.33 | 251769216 | 0.00 | 361898.67 | 4.70 | 695.69 | 2916.26 | false | 0.047689;0.047444;0.047288;0.047943;0.047324 | 251769216;251769216;251769216;251769216;251769216 | 0;0;0;0;0 | 347360;379296;360288;360864;364544 |
252 | InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 175 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 1227701.33 | 44.70 | 0.00 | 0.00 | true | 0.446738;0.444415;0.448855;0.452017;0.433761 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 1238176;1211712;1236768;1222912;1223424 |
253 | InceptionV4/InceptionV4/Mixed_6f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 156 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 73.00 | 83923072 | 765952.00 | 273472.00 | 3.10 | 80.74 | 1149.63 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 763520;772352;759424;767872;766464 | 271296;275648;276928;256576;273472 |
253 | InceptionV4/InceptionV4/Mixed_6f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 156 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524288.00 | 194677.33 | 42.70 | 0.00 | 0.00 | true | 0.429812;0.417600;0.425819;0.425407;0.436921 | 0;0;0;0;0 | 524288;524288;524288;524288;524288 | 196928;192160;191072;211584;194944 |
254 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 222976.00 | 15146.67 | 45.70 | 0.23 | 13.87 | true | 0.456153;0.457791;0.457785;0.455727;0.455998 | 55488;55488;55488;55488;55488 | 222976;222976;222976;222976;222976 | 15232;14976;15104;15104;15232 |
255 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 23.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 207872.00 | 34997.33 | 44.70 | 0.23 | 13.87 | true | 0.449801;0.448121;0.445835;0.446096;0.445494 | 55488;55488;55488;55488;55488 | 207872;207872;207872;207872;207872 | 34464;35072;34432;35584;35456 |
256 | InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 20.333 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 355434.67 | 51.90 | 0.31 | 27.74 | true | 0.518143;0.520022;0.517945;0.517988;0.521751 | 110976;110976;110976;110976;110976 | 1536;6656;1536;1536;1536 | 352448;356256;356160;355776;354368 |
257 | InceptionV4/InceptionV4/Mixed_6f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 19.667 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 36992 | 512.00 | 12074.67 | 43.50 | 2.94 | 12.33 | true | 0.434729;0.434815;0.434435;0.435785;0.436240 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 8704;9472;17536;20864;9216 |
258 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 18.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 41642.67 | 44.00 | 0.00 | 0.00 | true | 0.438923;0.439359;0.440050;0.440029;0.439477 | 0;0;0;0;0 | 0;0;0;0;0 | 43648;39808;40448;40832;43648 |
259 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 18 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 1664.00 | 43.70 | 0.00 | 0.00 | true | 0.437899;0.437120;0.437043;0.437053;0.437809 | 0;0;0;0;0 | 0;0;0;0;0 | 1536;1792;2048;1664;1280 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 33.00 | 244930560 | 17333.33 | 1053002.67 | 8.70 | 228.84 | 7422.14 | false | 0.086919;0.086794;0.086597;0.086608;0.086658 | 244930560;244930560;244930560;244930560;244930560 | 29728;13792;16416;21792;11168 | 1072032;1056832;1039392;1041440;1060736 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 2410.67 | 4070069.33 | 17.00 | 3.22 | 937.40 | true | 0.171638;0.168705;0.171193;0.169836;0.168148 | 13123584;13123584;13123584;13123584;13123584 | 2496;1920;2624;2624;2112 | 4038400;3978816;4091040;4102240;4080768 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222368.00 | 706560.00 | 2.00 | 1.25 | 105.63 | true | 0.020004;0.019986;0.019993;0.019971;0.019971 | 1161984;1161984;1161984;1161984;1161984 | 222368;222368;222368;222368;222368 | 721632;768160;697280;700768;692288 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 257354.67 | 44.00 | 0.00 | 0.00 | true | 0.436982;0.439727;0.441804;0.440522;0.438862 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1033728 | 255072;248256;256416;262464;260576 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 320.00 | 262378.67 | 2.00 | 4.15 | 218.04 | true | 0.019870;0.019928;0.019912;0.019860;0.019931 | 1090176;1090176;1090176;1090176;1090176 | 640;256;320;256;384 | 261024;260768;265344;256352;267168 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.67 | 285752320 | 2182346.67 | 1770048.00 | 10.00 | 72.30 | 8242.78 | false | 0.099725;0.099369;0.099873;0.099821;0.099968 | 285752320;285752320;285752320;285752320;285752320 | 1776096;1777600;1773344;1760704;1746080 | 2189280;2166944;2190944;2190816;2035040 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.33 | 1161984 | 228597.33 | 572874.67 | 2.00 | 1.45 | 71.14 | true | 0.019968;0.019935;0.019948;0.019926;0.019995 | 1161984;1161984;1161984;1161984;1161984 | 228512;228512;228768;228512;228768 | 589568;578976;560672;564768;574880 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 5770.67 | 4982368.00 | 18.70 | 3.07 | 956.93 | true | 0.184498;0.189171;0.189856;0.187009;0.185404 | 15310848;15310848;15310848;15310848;15310848 | 4256;7200;5344;6432;5536 | 4948576;4955200;5002496;5013536;4989408 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204416.00 | 1047029.33 | 45.00 | 0.00 | 0.00 | true | 0.446822;0.452095;0.450955;0.450829;0.445828 | 0;0;0;0;0 | 1204416;1204416;1204416;1204416;1206464 | 1018272;1046112;1048704;1048000;1046976 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 8.00 | 1271872 | 6986.67 | 108234.67 | 2.30 | 11.04 | 158.98 | true | 0.023038;0.023036;0.023033;0.023036;0.023034 | 1271872;1271872;1271872;1271872;1271872 | 6688;7008;6912;7040;7616 | 110944;107584;106176;103232;114624 |
262 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223978.67 | 41578.67 | 46.00 | 0.21 | 13.87 | true | 0.458938;0.457731;0.463646;0.458177;0.465262 | 55488;55488;55488;55488;55488 | 42400;41632;41888;40480;41216 | 230720;223296;223296;223296;225344 |
263 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 19.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 896.00 | 45.40 | 30.65 | 16.18 | false | 0.453401;0.462903;0.454595;0.452407;0.453204 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 768;1024;384;896;1664 |
264 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 21 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 1024.00 | 31754.67 | 44.00 | 0.00 | 0.00 | true | 0.439942;0.439690;0.439909;0.440458;0.439751 | 0;0;0;0;0 | 31488;30976;31872;32128;31904 | 1024;1024;1024;1024;1024 |
265 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 18.333 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 682.67 | 43.80 | 0.00 | 0.00 | true | 0.437943;0.437926;0.437662;0.437975;0.437660 | 0;0;0;0;0 | 0;0;0;0;0 | 1664;768;512;768;384 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.00 | 285752320 | 1771360.00 | 1876138.67 | 10.00 | 78.34 | 8404.48 | false | 0.100048;0.100005;0.100030;0.099937;0.099748 | 285752320;285752320;285752320;285752320;285752320 | 1838464;1724096;1827200;1754208;1732672 | 1985440;1869728;1885984;1872704;1849856 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.67 | 1161984 | 222709.33 | 637354.67 | 2.00 | 1.35 | 69.72 | true | 0.019959;0.019997;0.019961;0.019953;0.019953 | 1161984;1161984;1161984;1161984;1161984 | 646784;648768;641152;624128;606208 | 222624;222624;222624;222880;222880 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 5098.67 | 4923957.33 | 18.30 | 3.11 | 956.93 | true | 0.183175;0.183195;0.187093;0.182241;0.182674 | 15310848;15310848;15310848;15310848;15310848 | 7808;4800;4608;5632;4864 | 4851584;4916640;4933216;4922016;4980256 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204394.67 | 855520.00 | 44.70 | 0.00 | 0.00 | true | 0.446500;0.448415;0.449132;0.445170;0.447029 | 0;0;0;0;0 | 1204480;1204224;1204480;1204224;1204480 | 851520;843488;866560;860288;854752 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.333 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 3424.00 | 98496.00 | 2.30 | 12.48 | 317.97 | true | 0.022980;0.022980;0.022980;0.022978;0.022983 | 1271872;1271872;1271872;1271872;1271872 | 2976;3712;2880;3584;4736 | 92000;101600;91648;101888;101920 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 41.00 | 380608512 | 7055861.33 | 2648853.33 | 11.00 | 39.22 | 9283.13 | false | 0.109986;0.110140;0.110446;0.110012;0.110030 | 380608512;380608512;380608512;380608512;380608512 | 2674112;2658464;2648256;2639840;2637984 | 7032480;7021728;7071520;7091104;7063584 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 27.00 | 20414464 | 99680.00 | 7176448.00 | 19.60 | 2.81 | 756.09 | true | 0.195122;0.198280;0.195853;0.194968;0.195536 | 20414464;20414464;20414464;20414464;20414464 | 97760;103072;100832;99872;98336 | 7148256;7168960;7196576;7185664;7174720 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605824.00 | 1029653.33 | 45.00 | 0.00 | 0.00 | true | 0.449152;0.450511;0.445454;0.449451;0.451998 | 0;0;0;0;0 | 1605824;1605824;1605824;1605824;1605824 | 997696;1043136;1007872;1053568;1037952 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 637728.00 | 2.30 | 1.51 | 123.24 | true | 0.023168;0.023207;0.023178;0.023191;0.023190 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;261664;259360;259360 | 643232;640736;635904;618240;636544 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 1453568 | 3232.00 | 45408.00 | 2.60 | 29.88 | 207.65 | false | 0.026211;0.026205;0.026206;0.026201;0.026207 | 1453568;1453568;1453568;1453568;1453568 | 3232;3232;3232;3232;3232 | 41248;35200;43904;51072;53504 |
268 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 25.333 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 260160.00 | 20224.00 | 46.10 | 0.23 | 16.18 | true | 0.461423;0.462379;0.459761;0.458679;0.461135 | 64736;64736;64736;64736;64736 | 21504;23808;19200;17152;19968 | 260160;260160;260160;260160;262976 |
269 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 20.667 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1344.00 | 512.00 | 44.80 | 39.86 | 18.50 | false | 0.447643;0.460075;0.448850;0.448534;0.448093 | 73984;73984;73984;73984;73984 | 1344;1344;1344;1344;1344 | 384;640;384;640;512 |
270 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 18.333 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 768.00 | 42.67 | 44.00 | 0.00 | 0.00 | true | 0.440390;0.439859;0.440097;0.439318;0.440000 | 0;0;0;0;0 | 768;768;768;768;768 | 0;128;0;0;384 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.00 | 333032448 | 5073877.33 | 2284906.67 | 9.90 | 45.26 | 8325.81 | false | 0.098665;0.099112;0.099098;0.099045;0.099039 | 333032448;333032448;333032448;333032448;333032448 | 2314208;2276384;2279552;2199904;2298784 | 4980096;5214208;5078016;5163520;4881792 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.00 | 17862656 | 23360.00 | 5907818.67 | 20.50 | 3.01 | 1050.74 | true | 0.205573;0.205420;0.201747;0.205629;0.204154 | 17862656;17862656;17862656;17862656;17862656 | 27072;25792;22336;21952;21056 | 5892448;5907584;5923424;5965152;5883008 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1404928.00 | 513685.33 | 44.40 | 0.00 | 0.00 | true | 0.444571;0.442370;0.443062;0.445800;0.444441 | 0;0;0;0;0 | 1404928;1404928;1404928;1404928;1404928 | 516032;515136;509888;517568;509248 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 724160.00 | 2.30 | 1.38 | 123.24 | true | 0.023176;0.023175;0.023167;0.023158;0.023167 | 1355648;1355648;1355648;1355648;1355648 | 259360;266272;259360;259360;259360 | 727744;742720;711744;706752;732992 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 714.67 | 17162.67 | 2.30 | 71.14 | 317.97 | false | 0.022952;0.022963;0.022950;0.022950;0.022951 | 1271872;1271872;1271872;1271872;1271872 | 544;2464;672;800;672 | 18048;15392;18336;15840;17600 |
272 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 25 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 256.00 | 45.70 | 43.98 | 16.18 | false | 0.456470;0.457308;0.456665;0.456616;0.456641 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 1024;384;256;128;128 |
273 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 18 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 96.00 | 85.33 | 44.90 | 0.00 | 0.00 | true | 0.449504;0.448633;0.449224;0.459501;0.448715 | 0;0;0;0;0 | 96;352;96;96;96 | 0;0;128;128;384 |
274 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 178.667 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.00 | 256975104 | 5397.33 | 162026.67 | 3.10 | 1534.88 | 2953.74 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 5312;5568;5312;5568;5312 | 179360;173088;146336;165376;147616 |
274 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 178.667 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605632.00 | 909269.33 | 44.60 | 0.00 | 0.00 | true | 0.444868;0.446859;0.443054;0.446593;0.446143 | 0;0;0;0;0 | 1605632;1605632;1605632;1605632;1606144 | 867968;876928;931712;945920;919168 |
275 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 24.667 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1280.00 | 12768.00 | 44.40 | 5.27 | 18.50 | true | 0.444217;0.444745;0.443035;0.443319;0.444059 | 73984;73984;73984;73984;73984 | 1280;1280;1280;1280;1280 | 12928;12832;14080;12416;12544 |
277 | InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 21.667 | 1627648 | 0 | 172234240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 34517.33 | 61.40 | 0.00 | 0.00 | true | 0.611051;0.608726;0.614904;0.614713;0.619001 | 0;0;0;0;0 | 5888;256;256;256;256 | 35072;35328;33792;33536;34688 |
278 | InceptionV4/InceptionV4/Mixed_6g/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 46.333 | 1479936 | 1479936 | 173714176 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 15.00 | 5965762 | 3925.33 | 672618.67 | 52.90 | 8.82 | 397.72 | true | 0.529103;0.529715;0.528992;0.529386;0.533859 | 5965762;5965762;5965762;5965762;5965762 | 3904;3584;4160;3712;4160 | 674336;679872;667712;675200;668320 |
279 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 146.333 | 221952 | 1008384 | 173936128 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 430826.67 | 3.10 | 292.19 | 2170.42 | false | 0.031246;0.031247;0.031247;0.031247;0.031247 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 423296;434304;437312;433344;424832 |
279 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 146.333 | 221952 | 1008384 | 173936128 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786602.67 | 579648.00 | 43.70 | 0.00 | 0.00 | true | 0.447837;0.430802;0.439875;0.439413;0.432338 | 0;0;0;0;0 | 587136;576320;573120;576992;585632 | 786688;786432;786688;786432;786688 |
280 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140.333 | 221952 | 1008384 | 174158080 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 104586.67 | 3.10 | 1203.64 | 2170.42 | false | 0.031247;0.031246;0.031246;0.031247;0.031247 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 106176;104160;103424;101408;108544 |
280 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140.333 | 221952 | 1008384 | 174158080 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 118410.67 | 44.40 | 0.00 | 0.00 | true | 0.434149;0.445292;0.447792;0.443002;0.442664 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 119072;117536;118272;120224;117888 |
281 | InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 178.667 | 443904 | 2016768 | 174601984 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.67 | 251769216 | 15061.33 | 525802.67 | 4.80 | 465.49 | 2905.02 | false | 0.047282;0.047616;0.047498;0.047881;0.047697 | 251769216;251769216;251769216;251769216;251769216 | 531296;532096;525472;520640;520032 | 14208;16256;15872;13568;15104 |
281 | InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 178.667 | 443904 | 2016768 | 174601984 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572928.00 | 1165205.33 | 45.20 | 0.00 | 0.00 | true | 0.446577;0.464476;0.451809;0.449086;0.454539 | 0;0;0;0;0 | 1153248;1157728;1169248;1171392;1168640 | 1572928;1572928;1572928;1572928;1572864 |
282 | InceptionV4/InceptionV4/Mixed_6g/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 157.333 | 147968 | 672256 | 173122304 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 74.00 | 83923072 | 811008.00 | 223989.33 | 3.10 | 81.09 | 1134.10 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 809344;808320;813312;812288;811392 | 229088;223360;218368;230528;219520 |
282 | InceptionV4/InceptionV4/Mixed_6g/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 157.333 | 147968 | 672256 | 173122304 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524480.00 | 185258.67 | 43.20 | 0.00 | 0.00 | true | 0.429597;0.434842;0.435977;0.423816;0.431413 | 0;0;0;0;0 | 524480;529408;524480;524288;524480 | 183328;185888;186560;181984;188960 |
283 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 222976.00 | 597.33 | 45.70 | 0.25 | 13.87 | true | 0.457312;0.456925;0.455079;0.456500;0.458466 | 55488;55488;55488;55488;55488 | 222976;222976;222976;222976;222976 | 512;640;640;640;512 |
284 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 20.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 194304.00 | 95125.33 | 44.60 | 0.19 | 13.87 | true | 0.446492;0.446101;0.445935;0.445506;0.445479 | 55488;55488;55488;55488;55488 | 194304;194304;194304;194304;194304 | 95392;95008;94976;94880;97056 |
285 | InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 19.667 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 253098.67 | 51.90 | 0.44 | 27.74 | true | 0.519544;0.521803;0.515702;0.519127;0.518676 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1792 | 255872;255104;256416;243328;248320 |
286 | InceptionV4/InceptionV4/Mixed_6g/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 18.667 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 36992 | 512.00 | 9429.33 | 43.50 | 3.72 | 12.33 | true | 0.435164;0.435197;0.435921;0.436598;0.434099 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 7552;5120;11392;9344;12672 |
287 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 18.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 93781.33 | 43.90 | 0.00 | 0.00 | true | 0.438978;0.439620;0.439605;0.438479;0.438748 | 0;0;0;0;0 | 0;0;0;4352;0 | 93568;93824;93312;93952;94208 |
288 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 21.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 256.00 | 43.70 | 0.00 | 0.00 | true | 0.436918;0.436170;0.436957;0.436543;0.437214 | 0;0;0;0;0 | 0;0;0;0;0 | 256;256;256;256;256 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 33.00 | 244930560 | 9354.67 | 1001120.00 | 8.70 | 242.39 | 7422.14 | false | 0.086508;0.086728;0.086395;0.086796;0.086671 | 244930560;244930560;244930560;244930560;244930560 | 1018304;1017248;968960;1016032;970080 | 9696;10080;9824;7520;8544 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 2133.33 | 4033216.00 | 17.00 | 3.25 | 937.40 | true | 0.168984;0.169350;0.168582;0.170622;0.170375 | 13123584;13123584;13123584;13123584;13123584 | 1856;7296;2432;1984;1984 | 4009440;4027072;4063136;3990400;4095808 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222368.00 | 706005.33 | 2.00 | 1.25 | 105.63 | true | 0.019974;0.019999;0.019956;0.019955;0.019983 | 1161984;1161984;1161984;1161984;1161984 | 222368;222368;222112;222368;222368 | 708800;707424;701792;722688;659616 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 244896.00 | 44.10 | 0.00 | 0.00 | true | 0.441027;0.441506;0.440172;0.442681;0.441027 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 238304;231648;259936;240736;255648 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 5888.00 | 279530.67 | 2.00 | 3.82 | 218.04 | true | 0.019864;0.019926;0.019925;0.019864;0.019923 | 1090176;1090176;1090176;1090176;1090176 | 275904;274688;278432;289344;284256 | 6464;5504;5760;5952;5952 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 199.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.33 | 285752320 | 1925920.00 | 1812128.00 | 10.00 | 76.44 | 8322.96 | false | 0.099632;0.099718;0.099528;0.099672;0.099710 | 285752320;285752320;285752320;285752320;285752320 | 1823392;1785888;1807136;1843008;1805856 | 1879584;1920928;1951520;1905312;1985440 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 199.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 5280.00 | 4937109.33 | 18.80 | 3.10 | 956.93 | true | 0.186516;0.184474;0.188387;0.187913;0.188206 | 15310848;15310848;15310848;15310848;15310848 | 5152;5280;5152;5408;5920 | 4896448;4917376;4965856;4930400;4963552 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 199.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 1161984 | 228341.33 | 625984.00 | 2.00 | 1.36 | 72.62 | true | 0.019942;0.019957;0.019943;0.019931;0.019936 | 1161984;1161984;1161984;1161984;1161984 | 631520;637632;613824;632608;609920 | 228256;228256;228512;228256;228512 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 199.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.33 | 0 | 1204416.00 | 1076480.00 | 45.00 | 0.00 | 0.00 | true | 0.447559;0.449410;0.453365;0.449942;0.451936 | 0;0;0;0;0 | 1204416;1204416;1204416;1204416;1204416 | 1070656;1066528;1093536;1066560;1092224 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 199.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 8.00 | 1271872 | 14165.33 | 69120.00 | 2.30 | 15.27 | 158.98 | true | 0.023035;0.023036;0.023039;0.023034;0.023041 | 1271872;1271872;1271872;1271872;1271872 | 13728;11648;23744;14912;13856 | 62944;70240;70976;66144;72320 |
291 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223296.00 | 18709.33 | 45.80 | 0.23 | 13.87 | true | 0.459383;0.457072;0.458899;0.457955;0.458444 | 55488;55488;55488;55488;55488 | 19648;19072;16032;19584;17472 | 223296;223296;223296;223296;223296 |
292 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 19.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 29664.00 | 45.40 | 2.10 | 16.18 | true | 0.455670;0.452943;0.453727;0.452993;0.460729 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 26496;31488;31008;24320;32384 |
293 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 20 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 1024.00 | 3253.33 | 43.90 | 0.00 | 0.00 | true | 0.439705;0.439743;0.439528;0.439053;0.439166 | 0;0;0;0;0 | 1024;1024;5120;1024;1024 | 3328;3360;3072;4864;3072 |
294 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 17.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 640.00 | 43.80 | 0.00 | 0.00 | true | 0.437919;0.437124;0.437807;0.437395;0.437635 | 0;0;0;0;0 | 0;0;0;0;0 | 512;1024;640;256;768 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.00 | 285752320 | 1569376.00 | 1845984.00 | 10.00 | 83.67 | 8404.48 | false | 0.100136;0.099687;0.099838;0.099798;0.100135 | 285752320;285752320;285752320;285752320;285752320 | 1550880;1554688;1543456;1603872;1602560 | 1859296;1875232;1802176;1843936;1834720 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.67 | 1161984 | 222965.33 | 648128.00 | 2.00 | 1.33 | 69.72 | true | 0.019937;0.019935;0.019942;0.019952;0.019952 | 1161984;1161984;1161984;1161984;1161984 | 222880;222880;222880;223136;223136 | 680256;667200;634560;627648;642624 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 8085.33 | 5016896.00 | 18.20 | 3.05 | 956.93 | true | 0.183160;0.185204;0.180709;0.181289;0.179631 | 15310848;15310848;15310848;15310848;15310848 | 4987488;4983360;5048352;5026368;5036832 | 8128;8512;7872;8128;8000 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204394.67 | 814912.00 | 44.60 | 0.00 | 0.00 | true | 0.446142;0.445639;0.444873;0.446967;0.440680 | 0;0;0;0;0 | 807392;819904;807392;820000;817440 | 1204480;1204224;1204480;1204224;1204480 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 1271872 | 24448.00 | 74016.00 | 2.30 | 12.92 | 272.52 | true | 0.022992;0.022984;0.022992;0.023010;0.022973 | 1271872;1271872;1271872;1271872;1271872 | 23680;20832;24832;30912;24832 | 74752;72000;78688;68864;75296 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.67 | 380608512 | 6956917.33 | 2669440.00 | 11.00 | 39.54 | 9359.15 | false | 0.110353;0.110110;0.110277;0.110207;0.110110 | 380608512;380608512;380608512;380608512;380608512 | 6951712;6975136;6943904;7013536;6926368 | 2673440;2673216;2661664;2655008;2684192 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 26.67 | 20414464 | 68448.00 | 6973482.67 | 19.70 | 2.90 | 765.53 | true | 0.196303;0.197566;0.194585;0.195797;0.197493 | 20414464;20414464;20414464;20414464;20414464 | 64864;68448;66912;69984;72800 | 6945728;6953120;7008736;6991488;6975840 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605824.00 | 925653.33 | 44.80 | 0.00 | 0.00 | true | 0.445761;0.446981;0.449793;0.448508;0.448760 | 0;0;0;0;0 | 920096;924320;929312;924448;928192 | 1605824;1605824;1605824;1614016;1605824 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 714485.33 | 2.30 | 1.39 | 123.24 | true | 0.023198;0.023168;0.023201;0.023174;0.023179 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 737856;721632;715168;697920;706656 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 1453568 | 3488.00 | 65450.67 | 2.60 | 21.08 | 207.65 | true | 0.026203;0.026187;0.026190;0.026200;0.026215 | 1453568;1453568;1453568;1453568;1453568 | 3488;3488;3488;3488;3488 | 58240;63104;63488;72832;69760 |
297 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 27.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 260160.00 | 28544.00 | 46.20 | 0.22 | 16.18 | true | 0.463031;0.460660;0.458326;0.461985;0.462972 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;260160 | 29824;27008;28544;27264;31360 |
298 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 19.667 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1344.00 | 1322.67 | 44.90 | 27.74 | 18.50 | true | 0.449115;0.448830;0.448268;0.449112;0.448272 | 73984;73984;73984;73984;73984 | 1344;1344;1344;1344;1344 | 1024;2048;896;2048;768 |
299 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 18.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 768.00 | 85.33 | 43.90 | 0.00 | 0.00 | true | 0.439836;0.439132;0.439236;0.439522;0.439416 | 0;0;0;0;0 | 768;768;768;768;768 | 0;384;256;0;0 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.00 | 333032448 | 5188224.00 | 2233610.67 | 9.90 | 44.87 | 8325.81 | false | 0.098943;0.099195;0.098729;0.098822;0.098909 | 333032448;333032448;333032448;333032448;333032448 | 5225472;5269888;5165824;5159680;5173376 | 2243904;2275808;2219168;2217248;2237760 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.33 | 17862656 | 30485.33 | 5961973.33 | 20.30 | 2.98 | 1030.56 | true | 0.203454;0.202773;0.202856;0.203700;0.202728 | 17862656;17862656;17862656;17862656;17862656 | 30144;30400;30912;33856;29376 | 5953280;5855744;5964736;6003360;5967904 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.67 | 1355648 | 259360.00 | 767029.33 | 2.30 | 1.32 | 116.20 | true | 0.023189;0.023160;0.023173;0.023166;0.023169 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 768928;847936;769600;734880;762560 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1404928.00 | 395818.67 | 44.50 | 0.00 | 0.00 | true | 0.446302;0.443598;0.444435;0.441123;0.446445 | 0;0;0;0;0 | 394880;401152;391424;409856;378240 | 1404928;1404928;1404928;1404928;1404928 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 195.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 309.33 | 27392.00 | 2.30 | 45.91 | 317.97 | false | 0.022953;0.022935;0.022953;0.022965;0.022940 | 1271872;1271872;1271872;1271872;1271872 | 256;256;256;416;416 | 28160;25984;30592;25856;28032 |
301 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 25.333 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 128.00 | 45.70 | 48.17 | 16.18 | false | 0.457268;0.457033;0.457393;0.457761;0.457893 | 64736;64736;64736;64736;64736 | 128;0;256;0;256 | 1216;1216;1216;1216;1216 |
302 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 21.333 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 96.00 | 0.00 | 44.90 | 0.00 | 0.00 | true | 0.448590;0.448709;0.448523;0.448660;0.447982 | 0;0;0;0;0 | 96;96;96;96;608 | 0;128;0;0;0 |
303 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 176 | 295936 | 1901568 | 172271360 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.00 | 256975104 | 5141.33 | 234144.00 | 3.10 | 1073.93 | 2953.74 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 241792;246432;230464;230176;226176 | 5056;5312;5056;5312;5056 |
303 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 176 | 295936 | 1901568 | 172271360 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605632.00 | 625386.67 | 44.90 | 0.00 | 0.00 | true | 0.447689;0.450217;0.448046;0.452956;0.449903 | 0;0;0;0;0 | 1605632;1605632;1605632;1605632;1605632 | 621952;603872;632512;633120;621696 |
304 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 25 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1280.00 | 31872.00 | 44.40 | 2.23 | 18.50 | true | 0.444046;0.443897;0.443782;0.442738;0.444367 | 73984;73984;73984;73984;73984 | 30592;33280;31744;30464;35840 | 1280;1280;1280;1280;1280 |
306 | InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 21 | 1479936 | 0 | 172086528 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 132597.33 | 61.60 | 0.00 | 0.00 | true | 0.615885;0.615845;0.616250;0.616648;0.616624 | 0;0;0;0;0 | 256;256;256;256;512 | 132512;132512;132768;133280;132000 |
307 | InceptionV4/InceptionV4/Mixed_6h/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 52 | 1183744 | 1183744 | 173270272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 15.67 | 5692057 | 3840.00 | 1047018.67 | 53.00 | 5.42 | 363.32 | true | 0.530995;0.529869;0.530971;0.530200;0.529891 | 5692057;5692057;5692057;5692057;5692057 | 3840;3584;4096;3584;4096 | 1046048;1048384;1046176;1046496;1048832 |
308 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 143.333 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 364906.67 | 3.10 | 344.98 | 2170.42 | false | 0.031246;0.031245;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 367520;376288;351072;368064;359136 |
308 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 143.333 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786602.67 | 540266.67 | 43.60 | 0.00 | 0.00 | true | 0.424284;0.438390;0.439508;0.435631;0.432913 | 0;0;0;0;0 | 786688;786432;786688;786432;786688 | 537184;527648;547520;537632;545984 |
309 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 146 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 85.33 | 107914.67 | 3.10 | 1165.60 | 2170.42 | false | 0.031247;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;256;256;0;0 | 118976;107040;103808;107552;109152 |
309 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 146 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 111797.33 | 43.60 | 0.00 | 0.00 | true | 0.445729;0.436215;0.434201;0.431049;0.437167 | 0;0;0;0;0 | 786432;786432;786688;786432;786432 | 103328;108288;115552;114464;112640 |
310 | InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 173.333 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.33 | 251769216 | 0.00 | 388384.00 | 4.70 | 648.25 | 2916.26 | false | 0.047251;0.047639;0.047582;0.047555;0.047323 | 251769216;251769216;251769216;251769216;251769216 | 0;0;0;0;0 | 395232;411648;375488;369088;394432 |
310 | InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 173.333 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1573440.00 | 1189290.67 | 45.40 | 0.00 | 0.00 | true | 0.458074;0.458043;0.448447;0.453364;0.449626 | 0;0;0;0;0 | 1572928;1572928;1574464;1580608;1572864 | 1183200;1164896;1201280;1209184;1183392 |
311 | InceptionV4/InceptionV4/Mixed_6h/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 158 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 73.33 | 83923072 | 754261.33 | 266272.00 | 3.10 | 82.23 | 1144.41 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 751616;758016;755584;754688;752512 | 261024;268832;261408;268832;268576 |
311 | InceptionV4/InceptionV4/Mixed_6h/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 158 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524288.00 | 201098.67 | 42.60 | 0.00 | 0.00 | true | 0.425733;0.424798;0.420638;0.432410;0.427736 | 0;0;0;0;0 | 524288;524288;524288;524288;527360 | 205952;199264;205536;198496;198208 |
312 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 25.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 222976.00 | 15018.67 | 45.70 | 0.23 | 13.87 | true | 0.455299;0.458631;0.456247;0.457338;0.456037 | 55488;55488;55488;55488;55488 | 222976;222976;222976;222592;222976 | 14976;14976;15104;15616;14976 |
313 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 19 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 207872.00 | 36746.67 | 44.70 | 0.23 | 13.87 | true | 0.446514;0.447018;0.447754;0.446730;0.446011 | 55488;55488;55488;55488;55488 | 207872;207872;207872;207872;207872 | 37248;36128;37120;36096;36992 |
314 | InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 22 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 351978.67 | 51.80 | 0.31 | 27.74 | true | 0.519272;0.517588;0.516261;0.519930;0.516434 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1536 | 352864;351456;350560;351616;359520 |
315 | InceptionV4/InceptionV4/Mixed_6h/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 19 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 36992 | 512.00 | 13312.00 | 43.60 | 2.68 | 12.33 | true | 0.435064;0.435908;0.435610;0.435803;0.435916 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 15232;14208;8064;10496;15360 |
316 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 18.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 43690.67 | 43.90 | 0.00 | 0.00 | true | 0.439542;0.439524;0.439233;0.439303;0.438942 | 0;0;0;0;0 | 0;0;0;0;0 | 43648;43520;43904;43904;31744 |
317 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 18 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 85.33 | 1450.67 | 43.70 | 0.00 | 0.00 | true | 0.437340;0.436380;0.437241;0.437126;0.437123 | 0;0;0;0;0 | 256;512;0;0;0 | 1024;1664;1664;512;2176 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 33.00 | 244930560 | 14368.00 | 1046858.67 | 8.70 | 230.80 | 7422.14 | false | 0.087047;0.086867;0.086504;0.086879;0.086740 | 244930560;244930560;244930560;244930560;244930560 | 11552;16928;14368;14624;14112 | 1088352;1075648;1004448;1037184;1027744 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 2602.67 | 4060341.33 | 16.90 | 3.23 | 937.40 | true | 0.170707;0.168768;0.167955;0.168124;0.172477 | 13123584;13123584;13123584;13123584;13123584 | 2368;2624;2496;2688;6976 | 3952832;4003008;4128384;4088000;4090016 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222325.33 | 699850.67 | 2.00 | 1.26 | 105.63 | true | 0.019988;0.019976;0.019960;0.019991;0.020005 | 1161984;1161984;1161984;1161984;1161984 | 222240;221984;222368;222368;222624 | 756800;726592;667712;684640;688320 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 291509.33 | 44.00 | 0.00 | 0.00 | true | 0.440445;0.441682;0.436880;0.440258;0.439606 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 285344;301632;292576;282304;296608 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 597.33 | 260672.00 | 2.00 | 4.17 | 218.04 | true | 0.019870;0.019939;0.019913;0.019855;0.019925 | 1090176;1090176;1090176;1090176;1090176 | 260224;257440;262912;264928;258880 | 5120;1280;256;256;256 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 196.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.33 | 285752320 | 2200949.33 | 1777493.33 | 10.00 | 71.83 | 8322.96 | false | 0.099850;0.099553;0.099627;0.100021;0.099590 | 285752320;285752320;285752320;285752320;285752320 | 2204960;2196192;2201696;2195936;2227296 | 1789888;1817568;1741504;1767936;1774656 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 196.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 3402.67 | 4970997.33 | 18.90 | 3.08 | 956.93 | true | 0.189590;0.189342;0.185279;0.189999;0.188058 | 15310848;15310848;15310848;15310848;15310848 | 2976;3488;4512;2848;3744 | 4917024;4902720;4996960;5002400;4999008 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 196.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 1161984 | 228597.33 | 583712.00 | 2.00 | 1.43 | 72.62 | true | 0.019956;0.019920;0.019943;0.019927;0.019948 | 1161984;1161984;1161984;1161984;1161984 | 228512;228512;228768;228512;228768 | 590464;600864;573856;575264;585408 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 196.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.33 | 0 | 1204416.00 | 1039818.67 | 45.00 | 0.00 | 0.00 | true | 0.450084;0.450032;0.448398;0.454875;0.449366 | 0;0;0;0;0 | 1204416;1204416;1204416;1204416;1204416 | 1031136;1041952;1044224;1033280;1045888 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 196.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 8.00 | 1271872 | 7178.67 | 114442.67 | 2.30 | 10.46 | 158.98 | true | 0.023067;0.023064;0.023034;0.023040;0.023047 | 1271872;1271872;1271872;1271872;1271872 | 117408;109952;119392;114336;111584 | 6944;7680;7200;7328;7008 |
320 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 26 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223296.00 | 46837.33 | 45.80 | 0.21 | 13.87 | true | 0.458399;0.458132;0.459310;0.458058;0.456838 | 55488;55488;55488;55488;55488 | 223296;223296;223296;223296;223296 | 46272;46976;48928;47264;45984 |
321 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 22.333 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 640.00 | 45.40 | 34.88 | 16.18 | false | 0.455569;0.452543;0.454743;0.453285;0.454094 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 256;1024;640;768;512 |
322 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 19.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 1024.00 | 36437.33 | 44.00 | 0.00 | 0.00 | true | 0.440135;0.440122;0.439836;0.439835;0.439236 | 0;0;0;0;0 | 38656;37248;36992;35072;34592 | 1024;1024;1024;1024;1024 |
323 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 20.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 853.33 | 43.80 | 0.00 | 0.00 | true | 0.437981;0.436796;0.437481;0.437968;0.437229 | 0;0;0;0;0 | 0;0;0;0;0 | 256;384;4096;1664;512 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 34.00 | 285752320 | 1772501.33 | 1885365.33 | 10.00 | 78.12 | 8404.48 | false | 0.100056;0.100092;0.100002;0.099740;0.099696 | 285752320;285752320;285752320;285752320;285752320 | 1751872;1829312;1779392;1786240;1683904 | 1904896;1905280;1826112;1845920;1906464 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.33 | 1161984 | 222709.33 | 610986.67 | 2.00 | 1.39 | 71.14 | true | 0.019956;0.020001;0.019984;0.019989;0.019969 | 1161984;1161984;1161984;1161984;1161984 | 222624;222624;222624;222880;225184 | 644864;609152;593632;602368;621440 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 6976.00 | 4955306.67 | 18.30 | 3.09 | 956.93 | true | 0.186169;0.184226;0.182695;0.181818;0.181499 | 15310848;15310848;15310848;15310848;15310848 | 4918944;4946560;4989376;4993440;4929984 | 6976;7168;5120;6784;7168 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204394.67 | 845461.33 | 44.70 | 0.00 | 0.00 | true | 0.446298;0.444415;0.449133;0.446273;0.447156 | 0;0;0;0;0 | 840512;823360;847168;848704;848704 | 1204480;1204224;1204480;1204224;1204480 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 3989.33 | 100810.67 | 2.30 | 12.14 | 317.97 | true | 0.022981;0.022982;0.022979;0.022979;0.022980 | 1271872;1271872;1271872;1271872;1271872 | 101568;99648;99296;101216;102016 | 2688;3840;5248;4128;4000 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 211.333 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.00 | 380608512 | 7076725.33 | 2667466.67 | 11.00 | 39.06 | 9515.21 | false | 0.110391;0.110065;0.110332;0.110159;0.110467 | 380608512;380608512;380608512;380608512;380608512 | 7077408;7071264;7072672;7080096;7095328 | 2670208;2677024;2671392;2660800;2641120 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 211.333 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 27.00 | 20414464 | 98016.00 | 7167061.33 | 19.60 | 2.81 | 756.09 | true | 0.196678;0.192479;0.195687;0.198043;0.195194 | 20414464;20414464;20414464;20414464;20414464 | 98784;99872;96928;95584;98336 | 7109920;7149664;7157184;7202048;7194336 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 211.333 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605824.00 | 1030090.67 | 45.10 | 0.00 | 0.00 | true | 0.452868;0.448426;0.449698;0.451353;0.451324 | 0;0;0;0;0 | 1022592;1031232;1054976;1036448;994496 | 1605824;1605824;1605824;1605824;1605824 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 211.333 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 628949.33 | 2.30 | 1.53 | 123.24 | true | 0.023206;0.023156;0.023194;0.023193;0.023174 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 668288;635392;642016;595168;609440 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 211.333 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 1453568 | 3232.00 | 48128.00 | 2.60 | 28.30 | 207.65 | true | 0.026201;0.026201;0.026194;0.026185;0.026203 | 1453568;1453568;1453568;1453568;1453568 | 3232;3232;3232;3232;3232 | 38016;40192;46464;58880;57728 |
326 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 25.333 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 260160.00 | 19157.33 | 46.20 | 0.23 | 16.18 | true | 0.461314;0.464784;0.458827;0.464165;0.459604 | 64736;64736;64736;64736;64736 | 261696;260160;260160;260160;260160 | 20992;21376;18048;18432;18048 |
327 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 20 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1344.00 | 597.33 | 45.20 | 38.11 | 18.50 | false | 0.448328;0.459515;0.448743;0.460500;0.448739 | 73984;73984;73984;73984;73984 | 1344;1344;1344;1344;1344 | 768;640;384;256;768 |
328 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 20 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 768.00 | 341.33 | 44.00 | 0.00 | 0.00 | true | 0.439635;0.439449;0.439465;0.439902;0.439316 | 0;0;0;0;0 | 768;768;768;768;768 | 1280;0;896;128;0 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 40.00 | 333032448 | 5165056.00 | 2257397.33 | 9.90 | 44.87 | 8325.81 | false | 0.099075;0.098900;0.098827;0.099102;0.099096 | 333032448;333032448;333032448;333032448;333032448 | 5114752;5197440;5273216;5182976;5068672 | 2301088;2240256;2288832;2228480;2243104 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.00 | 17862656 | 23530.67 | 5934378.67 | 20.40 | 3.00 | 1050.74 | true | 0.204200;0.205180;0.203211;0.204036;0.203636 | 17862656;17862656;17862656;17862656;17862656 | 22080;23360;24512;24000;23232 | 5908128;5994304;5870432;5961952;5933056 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1404928.00 | 501376.00 | 44.50 | 0.00 | 0.00 | true | 0.447040;0.443234;0.445046;0.445430;0.442195 | 0;0;0;0;0 | 1404928;1415424;1404928;1404928;1404928 | 488704;509120;506304;487360;513440 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 722037.33 | 2.30 | 1.38 | 123.24 | true | 0.023169;0.023173;0.023168;0.023167;0.023156 | 1355648;1355648;1355648;1355648;1355648 | 259616;259360;259360;259360;259360 | 734656;687424;783808;714336;717120 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.00 | 1271872 | 544.00 | 16778.67 | 2.30 | 73.42 | 317.97 | false | 0.022947;0.022951;0.022951;0.022947;0.022951 | 1271872;1271872;1271872;1271872;1271872 | 16000;17568;15264;16768;19264 | 416;544;672;928;416 |
330 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 25.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 394.67 | 45.60 | 40.19 | 16.18 | false | 0.462619;0.456744;0.456246;0.456300;0.456417 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 544;640;256;384;128 |
331 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 19.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 181.33 | 42.67 | 44.90 | 0.00 | 0.00 | true | 0.449217;0.448532;0.448333;0.449442;0.448496 | 0;0;0;0;0 | 96;96;864;96;352 | 0;0;0;128;256 |
332 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 177 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.00 | 256975104 | 5397.33 | 157013.33 | 3.10 | 1582.26 | 2953.74 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 5312;5568;5312;5568;5312 | 164224;155136;143040;157600;158304 |
332 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 177 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605632.00 | 911061.33 | 44.80 | 0.00 | 0.00 | true | 0.449479;0.447633;0.447933;0.448532;0.447853 | 0;0;0;0;0 | 876160;916480;893824;926208;922880 | 1605632;1612288;1605632;1605632;1605632 |
333 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 30 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 73984 | 1280.00 | 12298.67 | 44.40 | 5.45 | 22.20 | true | 0.442866;0.448182;0.444586;0.443150;0.443503 | 73984;73984;73984;73984;73984 | 1280;1280;1280;1280;1280 | 12288;12320;12192;13312;12288 |
335 | InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 22.333 | 1627648 | 0 | 172234240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 36181.33 | 61.50 | 0.00 | 0.00 | true | 0.616000;0.612486;0.617129;0.614361;0.615477 | 0;0;0;0;0 | 256;256;256;256;256 | 35968;35968;36608;36992;35328 |
336 | InceptionV4/InceptionV4/Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 1024 8 8]] | 38.667 | 262144 | 262144 | 172496384 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 6.00 | 65536 | 6144.00 | 25898.67 | 28.40 | 2.05 | 10.92 | true | 0.284338;0.284720;0.284502;0.283108;0.284546 | 65536;65536;65536;65536;65536 | 6144;6144;6656;6144;6144 | 25856;25600;28288;25728;26112 |
337 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 149.667 | 295936 | 1344512 | 172792320 | GPU_0_bfc | 1048576 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 167846144 | 256.00 | 428192.00 | 3.10 | 391.75 | 2893.90 | false | 0.031243;0.031244;0.031243;0.031244;0.031244 | 167846144;167846144;167846144;167846144;167846144 | 256;256;256;256;256 | 420256;426752;438464;428736;429088 |
337 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 149.667 | 295936 | 1344512 | 172792320 | GPU_0_bfc | 1048576 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1048746.67 | 712448.00 | 44.10 | 0.00 | 0.00 | true | 0.441135;0.441430;0.452812;0.440049;0.439176 | 0;0;0;0;0 | 1048832;1048576;1048832;1048576;1048832 | 724448;717504;702592;702400;717248 |
338 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141.333 | 221952 | 1008384 | 173014272 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.00 | 125884608 | 0.00 | 181418.67 | 3.10 | 693.89 | 2170.42 | false | 0.031246;0.031246;0.031246;0.031246;0.031247 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;9984;0 | 179328;181536;182464;181952;180768 |
338 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141.333 | 221952 | 1008384 | 173014272 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 35520.00 | 44.20 | 0.00 | 0.00 | true | 0.445957;0.441924;0.454464;0.438468;0.437831 | 0;0;0;0;0 | 36896;34336;35328;36896;31392 | 786432;786432;786432;786432;786432 |
339 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 25 | 295936 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 73984 | 1066.67 | 28629.33 | 44.60 | 2.49 | 20.18 | true | 0.442925;0.450441;0.443514;0.450345;0.444176 | 73984;73984;73984;73984;73984 | 1024;1152;1024;1152;1024 | 30208;29056;27776;25728;29056 |
340 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 21.667 | 221952 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 55488 | 768.00 | 24832.00 | 44.20 | 2.17 | 18.50 | true | 0.448652;0.441011;0.442153;0.441539;0.440194 | 55488;55488;55488;55488;55488 | 768;768;768;768;768 | 24832;23552;24832;25088;24832 |
341 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 256 17 17]] | 19 | 295936 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 554.67 | 43.80 | 0.00 | 0.00 | true | 0.438300;0.438668;0.438547;0.437834;0.438475 | 0;0;0;0;0 | 512;640;512;640;512 | 0;0;0;0;6656 |
342 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 17.333 | 221952 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.70 | 0.00 | 0.00 | true | 0.437609;0.436909;0.437729;0.437134;0.437250 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
343 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 188 | 295936 | 2130944 | 171682560 | GPU_0_bfc | 1835008 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 98.00 | 293675264 | 128.00 | 285152.00 | 3.10 | 1029.43 | 2996.69 | false | 0.031246;0.031247;0.031247;0.031246;0.031246 | 293675264;293675264;293675264;293675264;293675264 | 5248;128;128;128;128 | 286272;283392;285792;294784;282304 |
343 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 188 | 295936 | 2130944 | 171682560 | GPU_0_bfc | 1835008 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 13.67 | 0 | 1835008.00 | 833258.67 | 45.50 | 0.00 | 0.00 | true | 0.455717;0.455076;0.457324;0.453252;0.451935 | 0;0;0;0;0 | 833344;832864;833568;828608;845024 | 1835008;1835008;1835008;1835008;1835008 |
344 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 182.333 | 49152 | 1376256 | 171435776 | GPU_0_bfc | 1327104 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 94.00 | 42479616 | 128.00 | 5813.33 | 3.10 | 7149.85 | 451.91 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 42479616;42479616;42479616;42479616;42479616 | 6528;4608;5120;5792;12064 | 128;128;128;2688;0 |
344 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 182.333 | 49152 | 1376256 | 171435776 | GPU_0_bfc | 1327104 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1327104.00 | 106549.33 | 44.00 | 0.00 | 0.00 | true | 0.438732;0.438630;0.441992;0.440911;0.440438 | 0;0;0;0;0 | 1327104;1327104;1327104;1327104;1327104 | 110144;105856;107136;106656;103008 |
345 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 25.333 | 295936 | 0 | 171213824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 73984 | 1024.00 | 384.00 | 44.40 | 52.55 | 20.18 | false | 0.443433;0.448749;0.443404;0.444684;0.443365 | 73984;73984;73984;73984;73984 | 1024;1024;1024;1024;1024 | 384;384;384;384;384 |
346 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 8 8]] | 19.333 | 49152 | 0 | 171213824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 12288 | 768.00 | 0.00 | 45.40 | 16.00 | 4.10 | true | 0.453918;0.457550;0.455022;0.453883;0.448590 | 12288;12288;12288;12288;12288 | 0;0;0;0;0 | 768;1024;768;768;768 |
347 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 256 17 17]] | 22 | 295936 | 0 | 171213824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 554.67 | 43.80 | 0.00 | 0.00 | true | 0.437901;0.438245;0.438160;0.437986;0.438143 | 0;0;0;0;0 | 512;640;512;640;512 | 0;0;0;0;0 |
348 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 192 8 8]] | 18.333 | 49152 | 0 | 171213824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 44.10 | 0.00 | 0.00 | true | 0.440852;0.441102;0.440781;0.440505;0.440641 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 243 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 63.00 | 543303680 | 11262090.67 | 3083605.33 | 13.30 | 37.87 | 8623.87 | false | 0.133374;0.133497;0.133350;0.133041;0.133359 | 543303680;543303680;543303680;543303680;543303680 | 3117824;3072672;3076864;3095744;3078208 | 11251424;11264992;11263840;11257440;11269216 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 243 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 34.00 | 29163520 | 773738.67 | 10486645.33 | 22.40 | 2.59 | 857.75 | true | 0.223037;0.225307;0.225349;0.223278;0.222584 | 29163520;29163520;29163520;29163520;29163520 | 774848;771840;774528;778880;767744 | 10435456;10497568;10474304;10488064;10503360 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 243 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.00 | 0 | 2293760.00 | 651040.00 | 45.80 | 0.00 | 0.00 | true | 0.457519;0.454534;0.457338;0.458469;0.461304 | 0;0;0;0;0 | 2293760;2293760;2293760;2293760;2293760 | 628288;652160;652704;650208;650752 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 243 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1549312 | 296096.00 | 782293.33 | 2.70 | 1.44 | 140.85 | true | 0.026514;0.026485;0.026497;0.026524;0.026494 | 1549312;1549312;1549312;1549312;1549312 | 296096;296352;296096;296096;296096 | 817120;779648;792288;773600;774944 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 243 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1816960 | 160.00 | 27264.00 | 3.20 | 66.25 | 363.39 | false | 0.032387;0.032339;0.032339;0.032351;0.032374 | 1816960;1816960;1816960;1816960;1816960 | 160;160;160;160;160 | 30336;26240;29440;23552;26112 |
350 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 320 17 17]] | 25.333 | 468736 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 92480 | 1600.00 | 85.33 | 49.30 | 54.87 | 23.12 | false | 0.492798;0.492533;0.492038;0.492491;0.503633 | 92480;92480;92480;92480;92480 | 1600;1600;1600;1600;1600 | 256;0;0;0;256 |
351 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu | Relu | [[1 320 17 17]] | 18.333 | 468736 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 608.00 | 42.67 | 50.80 | 0.00 | 0.00 | true | 0.507885;0.507369;0.507433;0.507772;0.507115 | 0;0;0;0;0 | 608;608;608;608;608 | 128;0;0;0;256 |
352 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 313.667 | 81920 | 3768320 | 171468544 | GPU_0_bfc | 3686400 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 208.00 | 117985280 | 1580458.67 | 634282.67 | 3.10 | 53.27 | 567.24 | false | 0.031250;0.031250;0.031250;0.031250;0.031250 | 117985280;117985280;117985280;117985280;117985280 | 1478080;1597440;1595264;1605248;1548672 | 569888;638912;643232;665792;620704 |
352 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 313.667 | 81920 | 3768320 | 171468544 | GPU_0_bfc | 3686400 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 24.00 | 0 | 3693738.67 | 2204469.33 | 46.50 | 0.00 | 0.00 | true | 0.465409;0.463991;0.464750;0.466062;0.465056 | 0;0;0;0;0 | 3688000;3694080;3694720;3697920;3692416 | 2165792;2213376;2206752;2216960;2193280 |
353 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 320 8 8]] | 24 | 81920 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 20480 | 1472.00 | 27178.67 | 46.40 | 0.71 | 5.12 | true | 0.463397;0.463652;0.463405;0.464116;0.463778 | 20480;20480;20480;20480;20480 | 1472;1472;1472;1472;1472 | 30240;27296;26432;24064;27808 |
354 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu | Relu | [[1 320 8 8]] | 18 | 81920 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42.67 | 44.20 | 0.00 | 0.00 | true | 0.441986;0.442395;0.442336;0.442061;0.442113 | 0;0;0;0;0 | 0;0;0;0;0 | 0;192;64;64;0 |
356 | InceptionV4/InceptionV4/Mixed_7b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1536 8 8]] | 40.333 | 393216 | 393216 | 171468544 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.00 | 1716066 | 6144.00 | 183541.33 | 40.30 | 9.05 | 214.51 | true | 0.403609;0.402609;0.403569;0.403023;0.403177 | 1716066;1716066;1716066;1716066;1716066 | 6144;5888;6400;5888;6400 | 219360;179968;178496;172384;192160 |
357 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 181 | 98304 | 2457600 | 171566848 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 75522048 | 0.00 | 204458.67 | 3.10 | 369.38 | 899.07 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 226560;210016;200448;202912;199168 | 0;0;0;0;0 |
357 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 181 | 98304 | 2457600 | 171566848 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.33 | 0 | 2359829.33 | 1860149.33 | 46.20 | 0.00 | 0.00 | true | 0.463909;0.454939;0.460013;0.460753;0.465618 | 0;0;0;0;0 | 2359296;2359296;2360832;2360832;2359360 | 1861888;1849344;1866240;1852320;1870176 |
358 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 169 | 98304 | 2457600 | 171665152 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 75522048 | 0.00 | 6442.67 | 3.10 | 11722.17 | 899.07 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 0;0;0;0;0 | 7040;6016;6016;6272;11648 |
358 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 169 | 98304 | 2457600 | 171665152 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.00 | 0 | 2359296.00 | 317866.67 | 44.20 | 0.00 | 0.00 | true | 0.441886;0.440552;0.443722;0.440548;0.442616 | 0;0;0;0;0 | 2359296;2359296;2359296;2359296;2359296 | 311680;318176;317472;318496;317952 |
359 | InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 172.667 | 114688 | 1687552 | 171779840 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 50348032 | 0.00 | 1024.00 | 3.10 | 49168.00 | 599.38 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 1152;1024;1024;1024;1024 | 0;0;0;0;0 |
359 | InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 172.667 | 114688 | 1687552 | 171779840 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 120693.33 | 43.80 | 0.00 | 0.00 | true | 0.434326;0.441082;0.442249;0.438311;0.435702 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 120928;120160;120992;121408;115616 |
360 | InceptionV4/InceptionV4/Mixed_7b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 190.333 | 65536 | 1638400 | 171376640 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 112.00 | 50348032 | 393216.00 | 313205.33 | 3.10 | 71.27 | 449.54 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 393216;393216;393216;393216;393216 | 316896;314208;308992;310784;314624 |
360 | InceptionV4/InceptionV4/Mixed_7b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 190.333 | 65536 | 1638400 | 171376640 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 244416.00 | 44.10 | 0.00 | 0.00 | true | 0.439776;0.440557;0.443019;0.440055;0.443842 | 0;0;0;0;0 | 1573120;1572864;1572864;1572864;1572864 | 239808;241856;249408;246464;244928 |
361 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 26 | 98304 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 97877.33 | 75648.00 | 46.40 | 0.14 | 6.14 | true | 0.462577;0.449833;0.464004;0.464934;0.470330 | 24576;24576;24576;24576;24576 | 75136;76288;75008;76160;75648 | 97280;97792;98048;97920;97920 |
362 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 20.667 | 98304 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 24576 | 1834.67 | 0.00 | 45.40 | 13.40 | 7.37 | true | 0.453881;0.453428;0.452920;0.453601;0.454910 | 24576;24576;24576;24576;24576 | 1536;1920;2432;1920;1664 | 0;128;0;0;0 |
363 | InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 19 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 1024.00 | 45.50 | 8.00 | 5.46 | true | 0.455527;0.454755;0.452559;0.456257;0.455444 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 1024;1024;1024;1024;1024 |
364 | InceptionV4/InceptionV4/Mixed_7b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 21.333 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 0.00 | 45.60 | 16.00 | 5.46 | true | 0.456461;0.455819;0.455637;0.455405;0.455273 | 16384;16384;16384;16384;16384 | 1024;1024;9216;1024;1024 | 0;0;0;128;0 |
365 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 18.667 | 98304 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 341.33 | 1066.67 | 44.10 | 0.00 | 0.00 | true | 0.440875;0.440893;0.441143;0.441084;0.441620 | 0;0;0;0;0 | 512;256;256;256;512 | 640;1536;2304;768;896 |
366 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 18.333 | 98304 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.440114;0.438821;0.439619;0.438902;0.439667 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
367 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 155.667 | 114688 | 2179072 | 171098112 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 66088960 | 0.00 | 32298.67 | 3.10 | 2046.18 | 1032.64 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 66088960;66088960;66088960;66088960;66088960 | 0;0;0;0;0 | 32256;32384;32640;32256;32128 |
367 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 155.667 | 114688 | 2179072 | 171098112 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 2064448.00 | 207904.00 | 43.60 | 0.00 | 0.00 | true | 0.437274;0.433405;0.435069;0.437376;0.436365 | 0;0;0;0;0 | 2064448;2064448;2069568;2064448;2064448 | 207840;208320;206400;207680;208192 |
368 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 151.667 | 98304 | 1277952 | 171098112 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 37765120 | 0.00 | 1013.33 | 3.10 | 37268.22 | 590.08 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 0;0;0;0;0 | 1056;928;1056;928;1184 |
368 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 151.667 | 98304 | 1277952 | 171098112 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 88789.33 | 43.60 | 0.00 | 0.00 | true | 0.439109;0.435184;0.435868;0.436300;0.436993 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179648 | 88832;88160;88832;89056;88704 |
369 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 148.667 | 65536 | 1245184 | 171163648 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 37765120 | 0.00 | 544.00 | 3.10 | 69421.18 | 590.08 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 0;0;0;0;0 | 544;544;544;672;544 |
369 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 148.667 | 65536 | 1245184 | 171163648 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 188266.67 | 44.00 | 0.00 | 0.00 | true | 0.436777;0.440640;0.439364;0.440962;0.438904 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179648 | 188640;188320;188160;188320;187904 |
370 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 448 8 8]] | 25.333 | 114688 | 0 | 171065344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 28672 | 1962.67 | 256.00 | 46.10 | 12.92 | 7.82 | true | 0.462342;0.455897;0.462635;0.456772;0.463261 | 28672;28672;28672;28672;28672 | 2048;1792;2048;1792;2048 | 256;256;256;256;256 |
371 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 20.333 | 98304 | 0 | 171065344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 0.00 | 45.50 | 16.00 | 5.46 | true | 0.455120;0.455356;0.454641;0.455907;0.455694 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 128;0;0;0;0 |
372 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 18.333 | 65536 | 0 | 171065344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 0.00 | 45.50 | 16.00 | 5.46 | true | 0.455142;0.455354;0.455242;0.455744;0.454544 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 0;0;0;0;0 |
373 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/Relu | Relu | [[1 448 8 8]] | 18.333 | 114688 | 0 | 171065344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 768.00 | 44.20 | 0.00 | 0.00 | true | 0.441997;0.441663;0.441205;0.441646;0.441467 | 0;0;0;0;0 | 0;0;0;0;0 | 768;768;768;896;768 |
374 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 167 | 131072 | 2883584 | 171196416 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 74.00 | 88113152 | 0.00 | 188405.33 | 3.10 | 467.68 | 1190.72 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 88113152;88113152;88113152;88113152;88113152 | 0;0;0;0;0 | 193568;195872;181568;183040;188608 |
374 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 167 | 131072 | 2883584 | 171196416 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 18.00 | 0 | 2752640.00 | 1010112.00 | 45.00 | 0.00 | 0.00 | true | 0.451357;0.450525;0.449068;0.448994;0.450320 | 0;0;0;0;0 | 2752640;2752640;2752640;2753408;2752640 | 988576;1001056;1019008;1010592;1018688 |
375 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 512 8 8]] | 24.667 | 131072 | 0 | 171081728 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 32768 | 2048.00 | 970.67 | 45.70 | 10.86 | 8.94 | true | 0.457074;0.459068;0.457554;0.457705;0.457072 | 32768;32768;32768;32768;32768 | 2048;2048;2048;2048;2048 | 544;544;1824;1824;544 |
376 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu | Relu | [[1 512 8 8]] | 19.667 | 131072 | 0 | 171081728 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 106.67 | 44.10 | 0.00 | 0.00 | true | 0.440566;0.440665;0.441351;0.441603;0.440993 | 0;0;0;0;0 | 64;128;0;128;384 | 0;0;0;0;0 |
377 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 169.667 | 98304 | 1671168 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 50348032 | 0.00 | 16469.33 | 3.10 | 3057.08 | 599.38 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;0;0;0;0 | 16960;16864;17248;14688;15584 |
377 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 169.667 | 98304 | 1671168 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 227733.33 | 44.00 | 0.00 | 0.00 | true | 0.438517;0.438383;0.436460;0.443219;0.443077 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1580288 | 226400;232992;227328;229472;224384 |
378 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 168 | 114688 | 1687552 | 171294720 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 50348032 | 0.00 | 35989.33 | 3.10 | 1398.97 | 599.38 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;0;0;0;0 | 38720;44928;34688;34560;34560 |
378 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 168 | 114688 | 1687552 | 171294720 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 147466.67 | 44.00 | 0.00 | 0.00 | true | 0.439211;0.438214;0.438927;0.444034;0.440889 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 149504;145408;145280;148864;148128 |
379 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 24.667 | 98304 | 0 | 171163648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 16384 | 1024.00 | 1024.00 | 45.70 | 8.00 | 4.47 | true | 0.456518;0.457197;0.457338;0.457219;0.457208 | 16384;16384;16384;16384;16384 | 8448;1024;1024;1024;1024 | 1024;1024;896;1024;1024 |
380 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 19 | 114688 | 0 | 171163648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 1365.33 | 45.50 | 6.86 | 5.46 | true | 0.451462;0.454745;0.455869;0.453847;0.455408 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 1408;1280;1408;1280;1408 |
382 | InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1536 8 8]] | 22 | 393216 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 32170.67 | 52.50 | 0.00 | 0.00 | true | 0.524863;0.524287;0.520099;0.525488;0.525059 | 0;0;0;0;0 | 0;0;0;0;0 | 29440;32128;32384;32000;34816 |
383 | InceptionV4/InceptionV4/Mixed_7c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1536 8 8]] | 40.667 | 557056 | 557056 | 171556864 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 1645026 | 0.00 | 178144.00 | 36.20 | 9.23 | 235.00 | true | 0.362084;0.361709;0.366884;0.361510;0.362483 | 1645026;1645026;1645026;1645026;1645026 | 0;0;2048;0;0 | 188032;177792;178368;177664;178272 |
384 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 177 | 98304 | 2457600 | 171655168 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 75522048 | 1877.33 | 237930.67 | 3.10 | 314.93 | 899.07 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 1664;2304;1664;2304;1664 | 246080;234688;234976;239328;239488 |
384 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 177 | 98304 | 2457600 | 171655168 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.00 | 0 | 2359360.00 | 968010.67 | 44.90 | 0.00 | 0.00 | true | 0.443788;0.453317;0.445567;0.450089;0.450410 | 0;0;0;0;0 | 2359360;2363968;2359360;2359360;2359360 | 948320;964448;976992;972896;966688 |
385 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 179.667 | 98304 | 2457600 | 171753472 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 83.67 | 75522048 | 0.00 | 59957.33 | 3.10 | 1259.60 | 902.65 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 0;0;0;0;0 | 61408;59616;60000;59744;60128 |
385 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 179.667 | 98304 | 2457600 | 171753472 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.33 | 0 | 2359296.00 | 223264.00 | 44.30 | 0.00 | 0.00 | true | 0.443337;0.441861;0.443655;0.439470;0.443421 | 0;0;0;0;0 | 2359296;2361344;2359296;2359296;2359296 | 217632;223936;223456;223104;223232 |
386 | InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 168.333 | 65536 | 1638400 | 171819008 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 50348032 | 0.00 | 938.67 | 3.10 | 53637.80 | 599.38 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 640;1024;1024;896;896 | 0;0;0;0;0 |
386 | InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 168.333 | 65536 | 1638400 | 171819008 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 53216.00 | 43.40 | 0.00 | 0.00 | true | 0.433566;0.432125;0.436601;0.432099;0.435992 | 0;0;0;0;0 | 1572864;1577984;1572864;1572864;1572864 | 53120;53536;53024;52736;53504 |
387 | InceptionV4/InceptionV4/Mixed_7c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 196.333 | 65536 | 1638400 | 171491328 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 111.00 | 50348032 | 393216.00 | 287872.00 | 3.10 | 73.92 | 453.59 | false | 0.031249;0.031249;0.031249;0.031249;0.031250 | 50348032;50348032;50348032;50348032;50348032 | 393216;401152;393216;393216;393216 | 288512;288256;285952;288256;287104 |
387 | InceptionV4/InceptionV4/Mixed_7c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 196.333 | 65536 | 1638400 | 171491328 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 176512.00 | 44.10 | 0.00 | 0.00 | true | 0.439748;0.442474;0.440582;0.442314;0.436292 | 0;0;0;0;0 | 175232;176640;177024;175872;177024 | 1572864;1572864;1572864;1572864;1572864 |
388 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 25.333 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 100096.00 | 78816.00 | 46.40 | 0.14 | 6.14 | true | 0.463303;0.461698;0.463778;0.464634;0.463766 | 24576;24576;24576;24576;24576 | 100096;100096;105472;100096;100096 | 79104;78720;79104;78624;78336 |
389 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 19 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 24576 | 1536.00 | 0.00 | 45.30 | 16.00 | 8.19 | true | 0.454403;0.451976;0.453653;0.440715;0.454052 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;1536 | 0;0;0;0;0 |
390 | InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 21 | 65536 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 256.00 | 45.50 | 12.80 | 5.46 | true | 0.454973;0.454237;0.454568;0.455984;0.454685 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 256;256;256;256;256 |
391 | InceptionV4/InceptionV4/Mixed_7c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 19.667 | 65536 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 3242.67 | 45.50 | 3.84 | 5.46 | true | 0.454264;0.455575;0.456266;0.453613;0.455802 | 16384;16384;16384;16384;16384 | 1024;1024;1024;6656;1024 | 1024;7424;5888;2816;1024 |
392 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 18.333 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 44.20 | 0.00 | 0.00 | true | 0.441265;0.441636;0.441508;0.441747;0.441685 | 0;0;0;0;0 | 0;0;4352;0;0 | 0;0;0;1536;0 |
393 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 18.667 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 44.00 | 0.00 | 0.00 | true | 0.439993;0.440212;0.439708;0.439492;0.440054 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
394 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 154.667 | 114688 | 2179072 | 171048960 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 66088960 | 0.00 | 2165.33 | 3.10 | 30521.38 | 1032.64 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 66088960;66088960;66088960;66088960;66088960 | 0;0;0;0;2816 | 2208;2208;2080;2208;1952 |
394 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 154.667 | 114688 | 2179072 | 171048960 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 2064448.00 | 263776.00 | 43.90 | 0.00 | 0.00 | true | 0.437199;0.438039;0.437246;0.441570;0.440676 | 0;0;0;0;0 | 266912;259488;262944;261504;266880 | 2064448;2064448;2064448;2064448;2064448 |
395 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 151.667 | 65536 | 1245184 | 171016192 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 68.00 | 37765120 | 20394.67 | 725.33 | 3.10 | 1788.12 | 555.37 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 20224;21248;22272;19712;19584 | 1664;128;256;256;1664 |
395 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 151.667 | 65536 | 1245184 | 171016192 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 120746.67 | 43.70 | 0.00 | 0.00 | true | 0.437045;0.436601;0.438586;0.436252;0.437040 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179648 | 120736;120608;120896;121152;120448 |
396 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 143.333 | 65536 | 1245184 | 171081728 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 37765120 | 64.00 | 64.00 | 3.10 | 295040.00 | 590.08 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 64;64;64;64;192 | 64;64;64;64;64 |
396 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 143.333 | 65536 | 1245184 | 171081728 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 58826.67 | 43.60 | 0.00 | 0.00 | true | 0.436059;0.434459;0.437676;0.436180;0.436130 | 0;0;0;0;0 | 1191680;1179648;1179648;1179648;1179648 | 57888;59424;59168;59424;57888 |
397 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 448 8 8]] | 25.667 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 28672 | 2048.00 | 128.00 | 46.20 | 13.18 | 7.17 | true | 0.462042;0.462690;0.460824;0.462822;0.462627 | 28672;28672;28672;28672;28672 | 2048;2048;2048;2048;7168 | 128;128;128;128;128 |
398 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 19 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 64.00 | 45.50 | 15.06 | 5.46 | true | 0.455368;0.454184;0.455343;0.455015;0.455191 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 64;64;64;192;64 |
399 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 21.333 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 16384 | 1109.33 | 192.00 | 45.50 | 12.59 | 4.92 | true | 0.454549;0.454189;0.454132;0.455726;0.456918 | 16384;16384;16384;16384;16384 | 192;192;192;192;192 | 1024;2560;1280;1024;1024 |
400 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/Relu | Relu | [[1 448 8 8]] | 18.333 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 44.20 | 0.00 | 0.00 | true | 0.441926;0.441632;0.441422;0.441603;0.441136 | 0;0;0;0;0 | 4096;0;0;0;0 | 0;0;0;0;128 |
401 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 167 | 131072 | 2883584 | 171114496 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 75.00 | 88113152 | 1685.33 | 81066.67 | 3.10 | 1064.79 | 1174.84 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 88113152;88113152;88113152;88113152;88113152 | 1792;1472;1792;1600;1664 | 88576;77952;85888;78848;78464 |
401 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 167 | 131072 | 2883584 | 171114496 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 18.00 | 0 | 2752512.00 | 1130901.33 | 45.40 | 0.00 | 0.00 | true | 0.452316;0.456806;0.454924;0.454687;0.452964 | 0;0;0;0;0 | 2752512;2752512;2752512;2752512;2752512 | 1109088;1147136;1127648;1127776;1137280 |
402 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 512 8 8]] | 27.333 | 131072 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 32768 | 2368.00 | 68416.00 | 46.50 | 0.46 | 8.19 | true | 0.464539;0.466038;0.463827;0.463601;0.465396 | 32768;32768;32768;32768;32768 | 66528;69280;67936;68352;68960 | 2368;2368;2368;2368;2368 |
403 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu | Relu | [[1 512 8 8]] | 18.667 | 131072 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 768.00 | 44.00 | 0.00 | 0.00 | true | 0.439801;0.440292;0.439853;0.440430;0.440415 | 0;0;0;0;0 | 0;0;0;0;0 | 768;768;896;640;768 |
404 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 173.333 | 114688 | 1687552 | 171114496 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 50348032 | 0.00 | 82528.00 | 3.10 | 610.07 | 599.38 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;0;0;0;0 | 85312;82560;80928;83200;81824 |
404 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 173.333 | 114688 | 1687552 | 171114496 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 264906.67 | 44.40 | 0.00 | 0.00 | true | 0.442131;0.445974;0.444097;0.443440;0.444632 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 260064;266976;271040;262656;265088 |
405 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 164.333 | 65536 | 1638400 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 50348032 | 0.00 | 18133.33 | 3.10 | 2776.55 | 599.38 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 18176;18048;18304;17792;18176 | 0;0;256;0;0 |
405 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 164.333 | 65536 | 1638400 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 127360.00 | 44.00 | 0.00 | 0.00 | true | 0.442028;0.441015;0.439585;0.436523;0.439956 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 126848;128128;125312;128256;127104 |
406 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 24.667 | 114688 | 0 | 171048960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 16384 | 1024.00 | 1109.33 | 45.80 | 7.68 | 4.92 | true | 0.457944;0.458066;0.457366;0.458063;0.456930 | 16384;16384;16384;16384;16384 | 1024;1280;1024;1024;1024 | 1152;1024;1152;1024;1152 |
407 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 19.667 | 65536 | 0 | 171048960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 16384 | 1024.00 | 42.67 | 45.30 | 15.36 | 4.92 | true | 0.452496;0.448230;0.455109;0.456094;0.452178 | 16384;16384;16384;16384;16384 | 1024;1024;6400;1024;1024 | 0;128;0;128;0 |
409 | InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1536 8 8]] | 22.667 | 557056 | 0 | 171163648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 152853.33 | 52.40 | 0.00 | 0.00 | true | 0.524177;0.525221;0.524458;0.524202;0.520025 | 0;0;0;0;0 | 0;0;0;0;0 | 153216;156800;152640;152704;151296 |
410 | InceptionV4/InceptionV4/Mixed_7d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1536 8 8]] | 39 | 393216 | 393216 | 171556864 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 1466766 | 0.00 | 196224.00 | 36.10 | 7.47 | 209.54 | true | 0.359899;0.361400;0.360996;0.360442;0.362090 | 1466766;1466766;1466766;1466766;1466766 | 198688;191584;193664;197920;197088 | 0;0;0;0;0 |
411 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 181 | 98304 | 2457600 | 171655168 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 75522048 | 0.00 | 174773.33 | 3.10 | 432.11 | 899.07 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 0;0;0;0;0 | 174176;176928;172576;177952;173216 |
411 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 181 | 98304 | 2457600 | 171655168 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.33 | 0 | 2359445.33 | 721557.33 | 44.20 | 0.00 | 0.00 | true | 0.442381;0.442038;0.441417;0.441023;0.443282 | 0;0;0;0;0 | 707040;719648;724096;720928;730944 | 2359360;2359360;2359616;2359872;2359360 |
412 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 173.333 | 98304 | 2457600 | 171753472 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 75522048 | 1024.00 | 74890.67 | 3.10 | 994.83 | 899.07 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 1024;1024;1024;1024;1024 | 75104;74464;76384;74080;75104 |
412 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 173.333 | 98304 | 2457600 | 171753472 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 16.00 | 0 | 2359296.00 | 451093.33 | 44.20 | 0.00 | 0.00 | true | 0.442879;0.440661;0.442665;0.446189;0.441903 | 0;0;0;0;0 | 2359296;2359296;2359296;2359296;2359296 | 452160;449152;449984;451136;452928 |
413 | InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 174.333 | 65536 | 1638400 | 171819008 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 83.67 | 50348032 | 0.00 | 1408.00 | 3.10 | 35758.55 | 601.77 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;0;0;0;0 | 1536;1408;1920;1280;1280 |
413 | InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 174.333 | 65536 | 1638400 | 171819008 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 48256.00 | 43.60 | 0.00 | 0.00 | true | 0.431997;0.436632;0.435145;0.437851;0.436028 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1573120 | 48000;48512;48512;48256;47520 |
414 | InceptionV4/InceptionV4/Mixed_7d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 191.333 | 65536 | 1638400 | 171327488 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 111.00 | 50348032 | 393216.00 | 300789.33 | 3.10 | 72.55 | 453.59 | false | 0.031249;0.031250;0.031250;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 302592;299392;301280;300800;300288 | 396160;393216;393216;393216;393216 |
414 | InceptionV4/InceptionV4/Mixed_7d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 191.333 | 65536 | 1638400 | 171327488 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 175104.00 | 44.10 | 0.00 | 0.00 | true | 0.439257;0.442192;0.439500;0.441592;0.442184 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 172416;176512;175360;173440;178688 |
415 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 25.333 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 100096.00 | 95114.67 | 46.50 | 0.13 | 6.14 | true | 0.466026;0.463126;0.464307;0.464345;0.468684 | 24576;24576;24576;24576;24576 | 100096;100096;100096;100096;100096 | 94720;95360;95136;94976;95232 |
416 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 22 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 24576 | 1536.00 | 0.00 | 45.40 | 16.00 | 8.19 | true | 0.455469;0.452969;0.454028;0.451701;0.454128 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;1536 | 0;0;0;0;0 |
417 | InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 19 | 65536 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 1280.00 | 45.50 | 7.11 | 5.46 | true | 0.454992;0.455623;0.455681;0.457200;0.453579 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 1280;1280;1280;1280;1280 |
418 | InceptionV4/InceptionV4/Mixed_7d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 19 | 65536 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 42.67 | 45.50 | 15.36 | 5.46 | true | 0.455787;0.454970;0.452709;0.456778;0.455342 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 0;4864;0;128;0 |
419 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 18.667 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 44.20 | 0.00 | 0.00 | true | 0.442075;0.441861;0.441777;0.441800;0.441727 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
420 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 17.333 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 44.00 | 0.00 | 0.00 | true | 0.439872;0.440276;0.439969;0.440307;0.439773 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
421 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 156 | 114688 | 2179072 | 171048960 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 66088960 | 0.00 | 6720.00 | 3.10 | 9834.67 | 1032.64 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 66088960;66088960;66088960;66088960;66088960 | 7200;6944;7040;6176;6176 | 0;0;0;0;0 |
421 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 156 | 114688 | 2179072 | 171048960 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 2064448.00 | 161696.00 | 43.60 | 0.00 | 0.00 | true | 0.435373;0.435919;0.435431;0.439684;0.433822 | 0;0;0;0;0 | 2064448;2064448;2064448;2064448;2064448 | 164448;158560;161184;165664;159456 |
422 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 148.667 | 65536 | 1245184 | 171016192 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 37765120 | 128.00 | 789.33 | 3.10 | 41168.39 | 590.08 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 128;128;384;128;128 | 288;1792;288;2048;288 |
422 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 148.667 | 65536 | 1245184 | 171016192 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 91338.67 | 43.50 | 0.00 | 0.00 | true | 0.434794;0.437208;0.434548;0.435641;0.435683 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179648 | 91296;91296;91296;91424;91808 |
423 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 147.333 | 65536 | 1245184 | 171081728 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 64.00 | 37765120 | 0.00 | 277.33 | 3.10 | 136172.47 | 590.08 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 0;0;0;0;0 | 256;288;256;288;384 |
423 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 147.333 | 65536 | 1245184 | 171081728 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 73450.67 | 44.00 | 0.00 | 0.00 | true | 0.440763;0.439881;0.438879;0.439255;0.440308 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179648 | 74080;72384;73984;71936;73984 |
424 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 448 8 8]] | 25.667 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 28672 | 1962.67 | 256.00 | 46.10 | 12.92 | 7.82 | true | 0.462868;0.457187;0.463051;0.457549;0.462721 | 28672;28672;28672;28672;28672 | 2048;1792;2048;1792;2048 | 256;256;256;256;256 |
425 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 22 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 0.00 | 45.50 | 16.00 | 5.46 | true | 0.454343;0.455310;0.450671;0.455518;0.455831 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 0;0;0;0;128 |
426 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 18.667 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 0.00 | 45.50 | 16.00 | 5.46 | true | 0.454883;0.452856;0.454790;0.455880;0.456734 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 0;0;0;0;0 |
427 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/Relu | Relu | [[1 448 8 8]] | 19 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 768.00 | 44.10 | 0.00 | 0.00 | true | 0.441658;0.441411;0.441284;0.442085;0.441108 | 0;0;0;0;0 | 0;0;0;0;0 | 768;768;768;768;768 |
428 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 169.667 | 131072 | 2883584 | 171114496 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 74.00 | 88113152 | 0.00 | 41237.33 | 3.10 | 2136.73 | 1190.72 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 88113152;88113152;88113152;88113152;88113152 | 0;0;1536;0;0 | 44000;37440;42272;57344;36896 |
428 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 169.667 | 131072 | 2883584 | 171114496 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 18.00 | 0 | 2752512.00 | 1082517.33 | 45.40 | 0.00 | 0.00 | true | 0.454577;0.454482;0.454146;0.452319;0.448704 | 0;0;0;0;0 | 2752512;2752512;2752512;2756608;2752512 | 1060640;1085824;1086240;1077440;1084288 |
429 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 512 8 8]] | 24.667 | 131072 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 32768 | 2048.00 | 116480.00 | 45.70 | 0.28 | 10.92 | true | 0.457306;0.457317;0.457211;0.455882;0.456915 | 32768;32768;32768;32768;32768 | 113632;118496;116320;117728;115392 | 2048;2048;2048;2048;2048 |
430 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/Relu | Relu | [[1 512 8 8]] | 19 | 131072 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 298.67 | 44.10 | 0.00 | 0.00 | true | 0.442028;0.440892;0.441153;0.441326;0.440980 | 0;0;0;0;0 | 0;0;0;0;0 | 256;384;256;384;256 |
431 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 170.333 | 114688 | 1687552 | 171114496 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 50348032 | 1621.33 | 40373.33 | 3.10 | 1198.91 | 599.38 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 4864;0;0;0;7424 | 41632;40896;39712;40512;39456 |
431 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 170.333 | 114688 | 1687552 | 171114496 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 259285.33 | 44.50 | 0.00 | 0.00 | true | 0.444480;0.444764;0.445659;0.447318;0.444349 | 0;0;0;0;0 | 258112;260832;261760;257248;258912 | 1572864;1572864;1572864;1572864;1572864 |
432 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 168.667 | 65536 | 1638400 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 84.00 | 50348032 | 0.00 | 63658.67 | 3.10 | 790.91 | 599.38 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 63360;64128;63488;63744;63744 | 0;0;0;0;0 |
432 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 168.667 | 65536 | 1638400 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 149845.33 | 44.10 | 0.00 | 0.00 | true | 0.441836;0.442479;0.441673;0.440703;0.440102 | 0;0;0;0;0 | 150400;148224;149632;150400;149504 | 1572864;1572864;1572864;1572864;1572864 |
433 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 24 | 114688 | 0 | 171048960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.00 | 16384 | 1024.00 | 1237.33 | 45.80 | 7.25 | 5.46 | true | 0.458003;0.457583;0.458247;0.457775;0.452183 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 1152;1152;1280;1280;1280 |
434 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 19 | 65536 | 0 | 171048960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.33 | 16384 | 1024.00 | 1109.33 | 45.60 | 7.68 | 4.92 | true | 0.455109;0.456332;0.456338;0.455257;0.455815 | 16384;16384;16384;16384;16384 | 12800;1024;1024;1024;1024 | 3200;1024;1152;1024;1152 |
436 | InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1536 8 8]] | 21.667 | 638976 | 0 | 171245568 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 89920.00 | 52.50 | 0.00 | 0.00 | true | 0.525409;0.524499;0.524719;0.525002;0.525086 | 0;0;0;0;0 | 0;0;0;0;0 | 89152;87616;92256;88448;92160 |
437 | InceptionV4/Logits/AvgPool_1a/AvgPool | AvgPool | [[1 1536 1 1]] | 41 | 6144 | 6144 | 171251712 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 8.00 | 131847 | 2816.00 | 9173.33 | 11.80 | 11.00 | 16.48 | true | 0.118387;0.118494;0.118564;0.118438;0.118529 | 131847;131847;131847;131847;131847 | 9088;9344;9216;9088;9216 | 2816;2816;2816;2816;2816 |
443 | InceptionV4/Logits/PreLogitsFlatten/Prod | Prod | [[]] | 40 | 256 | 256 | 170613248 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::functor::BlockReduceKernel<int*, int*, 256, tensorflow::functor::Prod<int> >(int*, int*, int, tensorflow::functor::Prod<int>, std::iterator_traits<int*>::value_type) | 3.00 | 0 | 3072.00 | 128.00 | 12.10 | 0.00 | 0.00 | true | 0.121050;0.121584;0.121214;0.121418;0.121634 | 0;0;0;0;0 | 3072;3072;3072;3072;3072 | 128;128;128;256;128 |
447 | InceptionV4/Logits/Logits/MatMul | MatMul | [[1 1001]] | 77 | 4096 | 4096 | 170616832 | GPU_0_bfc | 0 | 0 | 0 | 0 | void gemv2N_kernel<int, int, float, float, float, 128, 8, 4, 4, 1, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>) | 21.67 | 3202871 | 6157376.00 | 2087797.33 | 6.20 | 0.39 | 147.82 | true | 0.062490;0.062490;0.062491;0.062490;0.062490 | 3202871;3202871;3202871;3202871;3202871 | 6157376;6157376;6157376;6157376;6157440 | 2140800;2114432;2076224;2072736;2067744 |
448 | InceptionV4/Logits/Logits/BiasAdd | BiasAdd | [[1 1001]] | 25.333 | 4096 | 0 | 170610688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::BiasNHWCKernel<float>(int, float const*, float const*, float*, int) | 3.00 | 1001 | 5600.00 | 85.33 | 46.80 | 0.18 | 0.33 | true | 0.468400;0.466905;0.471154;0.467178;0.467578 | 1001;1001;1001;1001;1001 | 5600;5600;5600;5600;5600 | 128;128;128;0;0 |
449 | InceptionV4/Logits/Predictions | Softmax | [[1 1001]] | 61.333 | 4096 | 10240 | 170610688 | GPU_0_bfc | 10240 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 8.00 | 10431 | 6656.00 | 0.00 | 2.40 | 1.57 | 1.30 | true | 0.025417;0.024197;0.024059;0.023977;0.024171 | 10431;10431;10431;10431;10431 | 0;0;0;0;0 | 6656;6656;6656;6656;6656 |
449 | InceptionV4/Logits/Predictions | Softmax | [[1 1001]] | 61.333 | 4096 | 10240 | 170610688 | GPU_0_bfc | 10240 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 4.00 | 0 | 4096.00 | 0.00 | 4.20 | 0.00 | 0.00 | true | 0.042223;0.039698;0.040238;0.042366;0.042275 | 0;0;0;0;0 | 4096;4096;9216;4096;4096 | 0;0;768;0;0 |
449 | InceptionV4/Logits/Predictions | Softmax | [[1 1001]] | 61.333 | 4096 | 10240 | 170610688 | GPU_0_bfc | 10240 | 0 | 0 | 0 | void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 3.00 | 24024 | 2368.00 | 1792.00 | 6.20 | 5.78 | 8.01 | true | 0.062240;0.062253;0.062247;0.062261;0.062248 | 24024;24024;24024;24024;24024 | 1664;6272;1408;2048;1664 | 2368;9792;2368;2368;2368 |
Showing 1 to 708 of 708 entries