GPU Kernel Information
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|
layer_index | layer_name | layer_type | layer_shape | layer_duration (us) | layer_allocated_bytes | layer_peak_allocated_bytes | layer_allocator_bytes_in_use | layer_allocator_name | layer_host_temp_mem_bytes | layer_device_temp_mem_bytes | layer_host_persistent_mem_bytes | layer_device_persistent_mem_bytes | kernel_name | kernel_duration (us) | kernel_flops | kernel_dram_read_bytes | kernel_dram_write_bytes | kernel_achieved_occupancy (%) | kernel_arithmetic_intensity (flops/byte) | kernel_arithmetic_throughput (GFlops) | kernel_memory_bound | achieved_occupancy | flop_count_sp | dram_read_bytes | dram_write_bytes |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | InceptionV4/InceptionV4/Conv2d_1a_3x3/BatchNorm/batchnorm/mul-0-TransposeNHWCToNCHW-LayoutOptimizer | Transpose | [[1 3 299 299]] | 81.667 | 1072896 | 1072896 | 172752384 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::functor::SwapDimension1And2InTensor3UsingTiles<unsigned int, 1024, 1024, 2, false>(unsigned int const*, tensorflow::functor::Dimension<3>, unsigned int*) | 8.00 | 0 | 7050.67 | 74378.67 | 82.60 | 0.00 | 0.00 | true | 0.828204;0.821498;0.818761;0.828232;0.833813 | 0;0;0;0;0 | 7072;8864;7072;6944;7008 | 69344;77440;69824;75872;82784 |
2 | InceptionV4/InceptionV4/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 149 149]] | 191 | 2841856 | 2845440 | 174521344 | GPU_0_bfc | 3584 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 18.00 | 46192416 | 1305066.67 | 3483040.00 | 18.80 | 9.65 | 2566.25 | true | 0.188265;0.187867;0.187668;0.188729;0.188347 | 46192416;46192416;46192416;46192416;46192416 | 1368736;1381888;1250560;1295904;1198336 | 3588288;3576128;3395616;3477376;3334592 |
2 | InceptionV4/InceptionV4/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 149 149]] | 191 | 2841856 | 2845440 | 174521344 | GPU_0_bfc | 3584 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 3712.00 | 0.00 | 37.60 | 0.00 | 0.00 | true | 0.376404;0.376479;0.376380;0.376540;0.376504 | 0;0;0;0;0 | 3712;3712;5504;3712;3712 | 0;0;0;0;0 |
3 | InceptionV4/InceptionV4/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 32 149 149]] | 40.667 | 2841856 | 0 | 173448448 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.00 | 710432 | 1152.00 | 344800.00 | 78.90 | 2.05 | 118.41 | true | 0.787518;0.788792;0.789543;0.782694;0.791311 | 710432;710432;710432;710432;710432 | 1152;6272;1152;1152;1152 | 316320;320576;366656;347168;369184 |
4 | InceptionV4/InceptionV4/Conv2d_1a_3x3/Relu | Relu | [[1 32 149 149]] | 25.667 | 2841856 | 0 | 173448448 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 256.00 | 198858.67 | 68.70 | 0.00 | 0.00 | true | 0.678755;0.687956;0.689005;0.687006;0.685502 | 0;0;0;0;0 | 0;256;256;256;256 | 196256;196896;199296;207680;200384 |
5 | InceptionV4/InceptionV4/Conv2d_2a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 147 147]] | 162 | 2766080 | 2905600 | 176214528 | GPU_0_bfc | 139520 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 34.67 | 215183360 | 610197.33 | 3527968.00 | 22.60 | 52.00 | 6207.15 | false | 0.225128;0.224727;0.225743;0.225955;0.230015 | 215183360;215183360;215183360;215183360;215183360 | 658752;851744;636256;535584;527744 | 3589568;3793696;3547008;3446880;3447328 |
5 | InceptionV4/InceptionV4/Conv2d_2a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 147 147]] | 162 | 2766080 | 2905600 | 176214528 | GPU_0_bfc | 139520 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 6.00 | 59392 | 4608.00 | 37738.67 | 6.20 | 1.40 | 9.90 | true | 0.062432;0.062429;0.062433;0.062435;0.062430 | 59392;59392;59392;59392;59392 | 4608;4608;4608;4608;4608 | 37440;37824;38848;37184;37952 |
5 | InceptionV4/InceptionV4/Conv2d_2a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 32 147 147]] | 162 | 2766080 | 2905600 | 176214528 | GPU_0_bfc | 139520 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 36864.00 | 1066.67 | 42.40 | 0.00 | 0.00 | true | 0.422255;0.423376;0.423783;0.423470;0.423660 | 0;0;0;0;0 | 36864;36864;36864;36864;36864 | 1152;1024;1152;1024;1024 |
6 | InceptionV4/InceptionV4/Conv2d_2a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 32 147 147]] | 30.333 | 2766080 | 0 | 173372672 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.00 | 691488 | 256.00 | 118346.67 | 78.10 | 5.83 | 115.25 | true | 0.785772;0.782685;0.782489;0.778570;0.778092 | 691488;691488;691488;691488;691488 | 384;256;256;256;256 | 118432;116832;110208;119776;146304 |
7 | InceptionV4/InceptionV4/Conv2d_2a_3x3/Relu | Relu | [[1 32 147 147]] | 23 | 2766080 | 0 | 173372672 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 5.00 | 0 | 0.00 | 193248.00 | 67.50 | 0.00 | 0.00 | true | 0.675596;0.676357;0.674609;0.673314;0.669554 | 0;0;0;0;0 | 256;0;0;0;0 | 189696;198976;200768;191072;187552 |
8 | InceptionV4/InceptionV4/Conv2d_2b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 147 147]] | 146.667 | 5531904 | 5810688 | 178904576 | GPU_0_bfc | 278784 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 47.00 | 430366720 | 2053429.33 | 7571840.00 | 23.00 | 44.71 | 9156.74 | false | 0.231017;0.228471;0.230074;0.228923;0.229571 | 430366720;430366720;430366720;430366720;430366720 | 2082976;2065152;2100288;1976320;2012160 | 7602368;7699008;7537728;7562784;7550368 |
8 | InceptionV4/InceptionV4/Conv2d_2b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 147 147]] | 146.667 | 5531904 | 5810688 | 178904576 | GPU_0_bfc | 278784 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 74410.67 | 25525.33 | 42.10 | 0.00 | 0.00 | true | 0.420442;0.421785;0.421252;0.422052;0.420599 | 0;0;0;0;0 | 27008;24832;24736;23168;28544 | 75776;79616;73728;73728;73728 |
8 | InceptionV4/InceptionV4/Conv2d_2b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 64 147 147]] | 146.667 | 5531904 | 5810688 | 178904576 | GPU_0_bfc | 278784 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.33 | 118784 | 0.00 | 74069.33 | 6.20 | 1.60 | 35.64 | true | 0.062308;0.062306;0.062311;0.062313;0.062307 | 118784;118784;118784;118784;118784 | 0;0;0;0;0 | 73856;72320;74624;74496;73856 |
9 | InceptionV4/InceptionV4/Conv2d_2b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 64 147 147]] | 34 | 5531904 | 0 | 176138496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 10.33 | 1382976 | 1582496.00 | 3297002.67 | 83.70 | 0.28 | 133.84 | true | 0.836908;0.837558;0.836900;0.815010;0.837402 | 1382976;1382976;1382976;1382976;1382976 | 1595328;1561536;1638720;1586016;1566144 | 3298976;3252608;3409920;3272032;3320000 |
10 | InceptionV4/InceptionV4/Conv2d_2b_3x3/Relu | Relu | [[1 64 147 147]] | 26 | 5531904 | 0 | 176138496 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 8.00 | 0 | 96.00 | 4822912.00 | 74.90 | 0.00 | 0.00 | true | 0.748129;0.745266;0.758597;0.750581;0.749294 | 0;0;0;0;0 | 4832224;4823520;4782816;4812992;4843040 | 96;96;96;96;160 |
11 | InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 73 73]] | 174 | 2046464 | 2267648 | 178184960 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 87.67 | 591515232 | 454933.33 | 2706378.67 | 15.50 | 187.11 | 6747.30 | false | 0.155426;0.153965;0.152674;0.155712;0.154469 | 591515232;591515232;591515232;591515232;591515232 | 613280;308736;531168;387008;446624 | 2807104;2639936;2738080;2706432;2674624 |
11 | InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 73 73]] | 174 | 2046464 | 2267648 | 178184960 | GPU_0_bfc | 221184 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 552170.67 | 45.00 | 0.00 | 0.00 | true | 0.449746;0.450438;0.449706;0.450412;0.447982 | 0;0;0;0;0 | 222464;221184;221184;221184;221184 | 557248;559840;545984;553280;530368 |
12 | InceptionV4/InceptionV4/Mixed_3a/Branch_0/MaxPool_0a_3x3/MaxPool | MaxPool | [[1 64 73 73]] | 63 | 1364224 | 1364224 | 179549184 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 17.33 | 341056 | 4484821.33 | 2921664.00 | 65.70 | 0.05 | 19.68 | true | 0.653193;0.657562;0.660311;0.661389;0.652976 | 341056;341056;341056;341056;341056 | 4466336;4448352;4422560;4539776;4578432 | 2820064;2982976;2864864;2917152;2988448 |
13 | InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 73 73]] | 31 | 2046464 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 7.00 | 511584 | 1293589.33 | 760138.67 | 73.30 | 0.25 | 73.08 | true | 0.731090;0.736676;0.724204;0.737825;0.731388 | 511584;511584;511584;511584;511584 | 1315360;1238976;1305344;1282144;1293280 | 759104;773024;761792;759520;725888 |
14 | InceptionV4/InceptionV4/Mixed_3a/Branch_1/Conv2d_0a_3x3/Relu | Relu | [[1 96 73 73]] | 22.333 | 2046464 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.67 | 0 | 96.00 | 348661.33 | 67.60 | 0.00 | 0.00 | true | 0.677010;0.673717;0.700686;0.676417;0.672963 | 0;0;0;0;0 | 96;96;96;96;96 | 344096;352224;348512;359808;345248 |
16 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 117.333 | 1364224 | 1405184 | 175381504 | GPU_0_bfc | 40960 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 25.00 | 109786176 | 189770.67 | 1703509.33 | 11.30 | 57.99 | 4391.45 | false | 0.113440;0.112120;0.113418;0.114691;0.112164 | 109786176;109786176;109786176;109786176;109786176 | 1685600;1713216;1702016;1708704;1699808 | 195072;189824;182656;195168;184416 |
16 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 117.333 | 1364224 | 1405184 | 175381504 | GPU_0_bfc | 40960 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 40960.00 | 87797.33 | 42.00 | 0.00 | 0.00 | true | 0.419621;0.421688;0.420772;0.418731;0.419364 | 0;0;0;0;0 | 46848;40960;40960;40960;40960 | 100672;86880;86176;88672;87840 |
17 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 111 | 2046464 | 2087424 | 177427968 | GPU_0_bfc | 40960 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 24.67 | 109786176 | 29301.33 | 1461450.67 | 11.00 | 73.64 | 4450.73 | false | 0.109414;0.110803;0.111679;0.109126;0.110101 | 109786176;109786176;109786176;109786176;109786176 | 29664;29184;24064;37312;29056 | 1453632;1465280;1456576;1465344;1462496 |
17 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 111 | 2046464 | 2087424 | 177427968 | GPU_0_bfc | 40960 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 40960.00 | 4288.00 | 41.20 | 0.00 | 0.00 | true | 0.412282;0.412513;0.413559;0.412164;0.411992 | 0;0;0;0;0 | 40960;40960;40960;40960;40960 | 4832;3168;4032;5920;4000 |
18 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 73 73]] | 29.667 | 1364224 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 341056 | 182122.67 | 873941.33 | 56.90 | 0.32 | 68.21 | true | 0.568916;0.568962;0.567833;0.571173;0.569717 | 341056;341056;341056;341056;341056 | 168640;184384;181696;180288;196992 | 848768;881728;862336;887776;877760 |
19 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 73 73]] | 23.667 | 2046464 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 341056 | 405.33 | 624800.00 | 55.70 | 0.55 | 68.21 | true | 0.551536;0.566713;0.556747;0.558706;0.555002 | 341056;341056;341056;341056;341056 | 580224;673248;653600;592192;628608 | 384;384;384;512;448 |
20 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 73 73]] | 22.333 | 1364224 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 128448.00 | 60.40 | 0.00 | 0.00 | true | 0.604883;0.603844;0.604068;0.605017;0.603549 | 0;0;0;0;0 | 119616;147808;130976;121792;132576 | 0;0;0;0;0 |
21 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 64 73 73]] | 20.333 | 2046464 | 0 | 174017280 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 50869.33 | 59.80 | 0.00 | 0.00 | true | 0.597680;0.597270;0.599345;0.598607;0.587812 | 0;0;0;0;0 | 0;0;0;0;0 | 52000;52320;47968;50016;50592 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 13.67 | 1458176 | 18005.33 | 280032.00 | 2.50 | 4.89 | 106.69 | true | 0.024922;0.024927;0.024924;0.024928;0.024928 | 1458176;1458176;1458176;1458176;1458176 | 280544;280288;279264;275168;281056 | 16896;18176;17920;17920;18176 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.33 | 1663232 | 18535.11 | 608686.22 | 2.80 | 2.65 | 134.86 | true | 0.028397;0.028405;0.028368;0.028395;0.028432;0.028366;0.028398;0.028443;0.028365;0.028395;0.028442;0.028373;0.028396;0.028420;0.028351 | 1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232 | 17568;23744;13504;17568;24032;12992;17568;28384;13120;17568;23872;17856;17568;24096;13216 | 630912;830048;328288;619520;830784;338400;631456;826912;332096;631936;828544;339712;630784;831552;328960 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 12.33 | 41648640 | 12401.78 | 668632.89 | 4.50 | 61.15 | 3377.01 | false | 0.044863;0.044987;0.044980;0.046165;0.046260;0.046250;0.044245;0.044405;0.044386;0.044846;0.044975;0.044988;0.042478;0.042549;0.042546 | 41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640 | 18528;14016;1856;18528;13664;2144;18528;11808;1728;18528;13088;1888;18528;17952;1792 | 790560;533440;675968;794784;532576;660544;784576;531392;679296;789568;537696;672256;793888;547072;670720 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 1663232 | 18535.11 | 608686.22 | 2.80 | 2.65 | 138.60 | true | 0.028397;0.028405;0.028368;0.028395;0.028432;0.028366;0.028398;0.028443;0.028365;0.028395;0.028442;0.028373;0.028396;0.028420;0.028351 | 1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232 | 17568;23744;13504;17568;24032;12992;17568;28384;13120;17568;23872;17856;17568;24096;13216 | 630912;830048;328288;619520;830784;338400;631456;826912;332096;631936;828544;339712;630784;831552;328960 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 12.00 | 1663232 | 18535.11 | 608686.22 | 2.80 | 2.65 | 138.60 | true | 0.028397;0.028405;0.028368;0.028395;0.028432;0.028366;0.028398;0.028443;0.028365;0.028395;0.028442;0.028373;0.028396;0.028420;0.028351 | 1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232;1663232 | 17568;23744;13504;17568;24032;12992;17568;28384;13120;17568;23872;17856;17568;24096;13216 | 630912;830048;328288;619520;830784;338400;631456;826912;332096;631936;828544;339712;630784;831552;328960 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 11.67 | 41648640 | 12401.78 | 668632.89 | 4.50 | 61.15 | 3569.78 | false | 0.044863;0.044987;0.044980;0.046165;0.046260;0.046250;0.044245;0.044405;0.044386;0.044846;0.044975;0.044988;0.042478;0.042549;0.042546 | 41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640 | 18528;14016;1856;18528;13664;2144;18528;11808;1728;18528;13088;1888;18528;17952;1792 | 790560;533440;675968;794784;532576;660544;784576;531392;679296;789568;537696;672256;793888;547072;670720 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | volta_gcgemm_32x32_nt | 11.56 | 41648640 | 12401.78 | 668632.89 | 4.50 | 61.15 | 3604.07 | false | 0.044863;0.044987;0.044980;0.046165;0.046260;0.046250;0.044245;0.044405;0.044386;0.044846;0.044975;0.044988;0.042478;0.042549;0.042546 | 41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640;41648640 | 18528;14016;1856;18528;13664;2144;18528;11808;1728;18528;13088;1888;18528;17952;1792 | 790560;533440;675968;794784;532576;660544;784576;531392;679296;789568;537696;672256;793888;547072;670720 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.78 | 1617550.222 | 7363.56 | 555850.67 | 2.80 | 2.87 | 238.65 | true | 0.028335;0.028095;0.028135;0.028323;0.028128;0.028150;0.028326;0.028148;0.028157;0.028340;0.028124;0.028152;0.028340;0.028116;0.028153 | 1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480 | 13856;7616;1760;15360;6304;2304;14112;7296;1472;13952;6848;1312;13792;6496;1504 | 889696;497920;387104;881536;498688;382208;887968;492576;380896;892896;491360;387552;887648;478272;385024 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.22 | 1617550.222 | 7363.56 | 555850.67 | 2.80 | 2.87 | 259.97 | true | 0.028335;0.028095;0.028135;0.028323;0.028128;0.028150;0.028326;0.028148;0.028157;0.028340;0.028124;0.028152;0.028340;0.028116;0.028153 | 1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480 | 889696;497920;387104;881536;498688;382208;887968;492576;380896;892896;491360;387552;887648;478272;385024 | 13856;7616;1760;15360;6304;2304;14112;7296;1472;13952;6848;1312;13792;6496;1504 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.22 | 1617550.222 | 7363.56 | 555850.67 | 2.80 | 2.87 | 259.97 | true | 0.028335;0.028095;0.028135;0.028323;0.028128;0.028150;0.028326;0.028148;0.028157;0.028340;0.028124;0.028152;0.028340;0.028116;0.028153 | 1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480;1625856;1625856;1588480 | 13856;7616;1760;15360;6304;2304;14112;7296;1472;13952;6848;1312;13792;6496;1504 | 889696;497920;387104;881536;498688;382208;887968;492576;380896;892896;491360;387552;887648;478272;385024 |
22 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 299.333 | 1364224 | 4578560 | 175381504 | GPU_0_bfc | 3214336 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 114858.67 | 1237.33 | 43.30 | 0.00 | 0.00 | true | 0.431576;0.431567;0.435556;0.434970;0.430384 | 0;0;0;0;0 | 114688;114688;115200;116736;114688 | 1024;1024;1408;4864;1280 |
23 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 140 | 1935872 | 3300096 | 175953152 | GPU_0_bfc | 1364224 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 35.33 | 299980800 | 1341749.33 | 1732938.67 | 19.10 | 97.56 | 8490.10 | false | 0.190889;0.188826;0.189305;0.193264;0.191831 | 299980800;299980800;299980800;299980800;299980800 | 1337184;1350208;1339584;1342080;1343584 | 1733440;1740160;1725216;1763520;1723296 |
23 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 140 | 1935872 | 3300096 | 175953152 | GPU_0_bfc | 1364224 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 5.67 | 356352 | 4032.00 | 231456.00 | 6.20 | 1.51 | 62.88 | true | 0.062427;0.062429;0.062429;0.062431;0.062431 | 356352;356352;356352;356352;356352 | 4032;3840;4032;4032;4160 | 230112;233760;233824;230496;222784 |
23 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 140 | 1935872 | 3300096 | 175953152 | GPU_0_bfc | 1364224 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221568.00 | 119754.67 | 45.10 | 0.00 | 0.00 | true | 0.448227;0.451420;0.450577;0.450094;0.451448 | 0;0;0;0;0 | 221440;221632;221440;221888;221632 | 121568;120512;117088;118464;120288 |
24 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 64 73 73]] | 29.667 | 1364224 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.67 | 341056 | 121493.33 | 797706.67 | 61.30 | 0.37 | 60.18 | true | 0.613105;0.612910;0.610970;0.617743;0.612188 | 341056;341056;341056;341056;341056 | 119840;128480;116928;122656;121984 | 791968;814272;794848;803328;794944 |
25 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 71 71]] | 24.667 | 1935872 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 483936 | 384.00 | 777642.67 | 68.60 | 0.62 | 96.79 | true | 0.682763;0.686874;0.686452;0.687314;0.685133 | 483936;483936;483936;483936;483936 | 802464;750592;779872;638880;829760 | 384;384;384;384;384 |
26 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 64 73 73]] | 21.667 | 1364224 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.33 | 0 | 1024.00 | 559466.67 | 66.30 | 0.00 | 0.00 | true | 0.664002;0.652309;0.660433;0.664670;0.664240 | 0;0;0;0;0 | 1024;1024;1024;1024;1024 | 572480;567872;538048;699328;514240 |
27 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 162.333 | 1364224 | 1510912 | 175270912 | GPU_0_bfc | 146688 | 0 | 0 | 0 | volta_scudnn_128x64_relu_small_nn_v1 | 58.67 | 308969472 | 24128.00 | 1498282.67 | 6.20 | 202.95 | 5266.50 | false | 0.062493;0.062493;0.062493;0.062493;0.062493 | 308969472;308969472;308969472;308969472;308969472 | 24896;24128;24128;24128;24128 | 1488096;1496928;1499744;1500000;1498176 |
27 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 162.333 | 1364224 | 1510912 | 175270912 | GPU_0_bfc | 146688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 114688.00 | 65802.67 | 43.30 | 0.00 | 0.00 | true | 0.433136;0.432586;0.433581;0.434360;0.433476 | 0;0;0;0;0 | 114688;114688;114688;114688;114944 | 64896;68032;64480;60992;68896 |
27 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 73 73]] | 162.333 | 1364224 | 1510912 | 175270912 | GPU_0_bfc | 146688 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 4.00 | 0 | 2560.00 | 14549.33 | 6.20 | 0.00 | 0.00 | true | 0.061925;0.061756;0.061291;0.061433;0.061316 | 0;0;0;0;0 | 14080;14336;14848;16640;14464 | 2560;2560;2560;2560;2560 |
28 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 73 73]] | 28.667 | 1364224 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 341056 | 256.00 | 35104.00 | 54.20 | 9.65 | 68.21 | true | 0.541671;0.543909;0.539208;0.543086;0.540885 | 341056;341056;341056;341056;341056 | 256;256;256;256;256 | 34048;36192;35200;35040;35072 |
29 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_0c_7x1/Relu | Relu | [[1 64 73 73]] | 22.333 | 1364224 | 0 | 173906688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 810.67 | 60.50 | 0.00 | 0.00 | true | 0.603662;0.608658;0.604044;0.605284;0.604653 | 0;0;0;0;0 | 0;0;0;0;0 | 768;896;768;896;768 |
30 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 130.333 | 3410688 | 4246784 | 177317376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 33.00 | 299980800 | 318442.67 | 2588554.67 | 19.00 | 103.19 | 9090.33 | false | 0.190417;0.191912;0.189128;0.203778;0.184614 | 299980800;299980800;299980800;299980800;299980800 | 306080;377248;325984;302976;323264 | 2571328;2668704;2609632;2547040;2584704 |
30 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 130.333 | 3410688 | 4246784 | 177317376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 44586.67 | 44.50 | 0.00 | 0.00 | true | 0.444454;0.445999;0.444980;0.444461;0.445002 | 0;0;0;0;0 | 221184;221184;221184;221184;222464 | 44032;44160;43776;48384;45568 |
30 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 71 71]] | 130.333 | 3410688 | 4246784 | 177317376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 239776.00 | 6.20 | 1.49 | 89.09 | true | 0.062300;0.062306;0.062306;0.062302;0.062300 | 356352;356352;356352;356352;356352 | 241952;235808;242336;236832;240544 | 0;0;0;0;0 |
31 | InceptionV4/InceptionV4/Mixed_4a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 71 71]] | 28.667 | 3410688 | 0 | 175953152 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 483936 | 384.00 | 47072.00 | 69.00 | 10.20 | 96.79 | true | 0.695216;0.690995;0.692175;0.687337;0.684426 | 483936;483936;483936;483936;483936 | 384;384;384;384;384 | 64800;24032;70912;28256;48160 |
33 | InceptionV4/InceptionV4/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 192 71 71]] | 25.333 | 3871488 | 0 | 174478080 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 6.00 | 0 | 417664.00 | 2373130.67 | 77.10 | 0.00 | 0.00 | true | 0.773080;0.774884;0.749534;0.773126;0.766435 | 0;0;0;0;0 | 432128;416640;417792;416384;418560 | 2356640;2367040;2387104;2382624;2369728 |
34 | InceptionV4/InceptionV4/Mixed_5a/Branch_1/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 192 35 35]] | 43.333 | 940800 | 940800 | 175418880 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 9.67 | 235200 | 7637.33 | 1265098.67 | 55.70 | 0.18 | 24.33 | true | 0.561625;0.557898;0.556369;0.557263;0.555458 | 235200;235200;235200;235200;235200 | 7232;7776;12352;7296;7840 | 1267104;1261248;1267104;1259744;1266944 |
35 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 289.667 | 940800 | 2275328 | 176359680 | GPU_0_bfc | 1334528 | 0 | 0 | 0 | volta_scudnn_128x64_relu_interior_nn_v1 | 183.00 | 849838080 | 3501642.67 | 2118293.33 | 6.20 | 151.22 | 4643.92 | false | 0.062498;0.062498;0.062498;0.062498;0.062498 | 849838080;849838080;849838080;849838080;849838080 | 2124160;2114560;2118976;2113504;2121344 | 3487584;3510240;3495872;3502080;3506976 |
35 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 289.667 | 940800 | 2275328 | 176359680 | GPU_0_bfc | 1334528 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.67 | 0 | 1327104.00 | 1267402.67 | 44.50 | 0.00 | 0.00 | true | 0.445052;0.445142;0.445367;0.447137;0.444943 | 0;0;0;0;0 | 1327104;1327104;1327104;1327104;1327104 | 1245216;1265888;1270752;1268928;1267392 |
35 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 289.667 | 940800 | 2275328 | 176359680 | GPU_0_bfc | 1334528 | 0 | 0 | 0 | cudnn::gemm::computeOffsetsKernel(cudnn::gemm::ComputeOffsetsParams) | 3.00 | 0 | 352.00 | 3658.67 | 6.10 | 0.00 | 0.00 | true | 0.061111;0.061123;0.061104;0.061113;0.061095 | 0;0;0;0;0 | 3712;3456;3552;3712;3712 | 352;352;352;352;352 |
36 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 35 35]] | 31 | 940800 | 0 | 172488192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 235200 | 1088.00 | 864.00 | 59.50 | 120.49 | 47.04 | false | 0.594728;0.595183;0.594974;0.595803;0.593843 | 235200;235200;235200;235200;235200 | 1024;512;1056;1152;512 | 1088;1088;1088;1088;1088 |
37 | InceptionV4/InceptionV4/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 192 35 35]] | 21 | 940800 | 0 | 172488192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 96.00 | 1066.67 | 68.40 | 0.00 | 0.00 | true | 0.712810;0.683811;0.683739;0.684220;0.685254 | 0;0;0;0;0 | 96;96;96;96;96 | 128;512;2176;512;2816 |
39 | InceptionV4/InceptionV4/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 384 35 35]] | 55.667 | 1881600 | 1881600 | 174369792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 17.00 | 12156951 | 10240.00 | 1940117.33 | 62.00 | 6.23 | 715.11 | true | 0.620507;0.619287;0.620581;0.620062;0.620676 | 12156951;12156951;12156951;12156951;12156951 | 15360;10240;10240;10240;10240 | 1937344;1941216;1940800;1939904;1939648 |
40 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 119.333 | 313600 | 411904 | 174683392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 29.33 | 61420096 | 18496.00 | 372245.33 | 3.10 | 157.19 | 2093.89 | false | 0.031247;0.031247;0.031247;0.031247;0.031247 | 61420096;61420096;61420096;61420096;61420096 | 18496;19008;18496;18496;18496 | 373280;358528;370560;373504;372896 |
40 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 119.333 | 313600 | 411904 | 174683392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 98304.00 | 640.00 | 42.90 | 0.00 | 0.00 | true | 0.428451;0.428685;0.429810;0.429714;0.430178 | 0;0;0;0;0 | 98304;98304;98304;98304;98560 | 640;11520;640;640;640 |
41 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 109 | 313600 | 411904 | 174996992 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 23.00 | 61420096 | 256.00 | 323210.67 | 3.10 | 189.88 | 2670.44 | false | 0.031232;0.031232;0.031232;0.031232;0.031232 | 61420096;61420096;61420096;61420096;61420096 | 256;256;256;256;256 | 321056;322816;323488;323328;324128 |
41 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 109 | 313600 | 411904 | 174996992 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98304.00 | 15189.33 | 42.80 | 0.00 | 0.00 | true | 0.431511;0.426750;0.427401;0.428520;0.428142 | 0;0;0;0;0 | 98304;98304;98304;98304;98304 | 14080;17152;16512;14976;14080 |
42 | InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.667 | 470528 | 617984 | 175467520 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 33.33 | 92130144 | 0.00 | 504842.67 | 4.50 | 182.49 | 2763.93 | false | 0.045520;0.044792;0.045040;0.045092;0.045413 | 92130144;92130144;92130144;92130144;92130144 | 0;0;0;0;0 | 503232;505248;508992;505504;503776 |
42 | InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.667 | 470528 | 617984 | 175467520 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 6314.67 | 43.50 | 0.00 | 0.00 | true | 0.433579;0.436440;0.432974;0.435710;0.434637 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 6784;5504;6016;6528;6400 |
43 | InceptionV4/InceptionV4/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.333 | 470528 | 617984 | 174056448 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 33.67 | 92130144 | 0.00 | 457930.67 | 4.50 | 201.19 | 2736.51 | false | 0.044854;0.045586;0.045385;0.045326;0.046086 | 92130144;92130144;92130144;92130144;92130144 | 0;0;0;0;0 | 459584;452736;452768;464320;461440 |
43 | InceptionV4/InceptionV4/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.333 | 470528 | 617984 | 174056448 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 147456.00 | 15392.00 | 43.40 | 0.00 | 0.00 | true | 0.432576;0.435143;0.433389;0.434315;0.435147 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 16032;15520;13856;14624;16032 |
44 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 28.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 1621.33 | 32394.67 | 45.20 | 2.30 | 19.60 | true | 0.454078;0.452518;0.451798;0.452341;0.452182 | 78400;78400;78400;78400;78400 | 2816;1024;6400;1024;1024 | 30592;34432;33056;32384;31744 |
45 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 21.667 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 118016.00 | 44.10 | 0.66 | 19.60 | true | 0.441013;0.440186;0.441759;0.439790;0.440508 | 78400;78400;78400;78400;78400 | 256;256;256;256;512 | 118016;115712;120320;122880;110496 |
46 | InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 22 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 209450.67 | 53.60 | 0.56 | 29.40 | true | 0.537383;0.536060;0.536980;0.534867;0.536282 | 117600;117600;117600;117600;117600 | 210464;208800;209088;204320;213664 | 384;384;384;384;384 |
47 | InceptionV4/InceptionV4/Mixed_5b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 20.667 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 35626.67 | 53.60 | 3.27 | 29.40 | true | 0.533102;0.536154;0.536778;0.536322;0.537778 | 117600;117600;117600;117600;117600 | 640;384;384;384;384 | 29312;40096;35072;33792;38016 |
48 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 22.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.437581;0.437847;0.437940;0.438474;0.437968 | 0;0;0;0;0 | 0;0;0;0;0 | 256;256;256;256;256 |
49 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 19.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 554.67 | 43.50 | 0.00 | 0.00 | true | 0.435471;0.435110;0.436065;0.435325;0.434539 | 0;0;0;0;0 | 0;0;0;0;0 | 512;640;512;640;512 |
50 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 125 | 470528 | 1411072 | 172645376 | GPU_0_bfc | 940544 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 19.33 | 99993600 | 3584.00 | 550346.67 | 12.50 | 180.52 | 5172.17 | false | 0.124657;0.124645;0.124669;0.124650;0.124666 | 99993600;99993600;99993600;99993600;99993600 | 549344;550528;556896;551168;548448 | 3840;3328;3584;3328;3840 |
50 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 125 | 470528 | 1411072 | 172645376 | GPU_0_bfc | 940544 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 4992.00 | 44.40 | 0.00 | 0.00 | true | 0.444071;0.443316;0.444071;0.442788;0.444546 | 0;0;0;0;0 | 221184;221184;221184;221184;224000 | 4736;4384;5632;4608;10112 |
50 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 125 | 470528 | 1411072 | 172645376 | GPU_0_bfc | 940544 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 1792.00 | 86613.33 | 6.20 | 4.03 | 89.09 | true | 0.062323;0.062335;0.062335;0.062343;0.062330 | 356352;356352;356352;356352;356352 | 1792;1792;1792;1792;1792 | 82816;87680;95360;89344;82816 |
51 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 117 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 19.00 | 99993600 | 0.00 | 310666.67 | 12.50 | 321.87 | 5262.82 | false | 0.124633;0.124624;0.124633;0.124610;0.124621 | 99993600;99993600;99993600;99993600;99993600 | 0;0;0;2048;0 | 310720;310592;310816;306112;310688 |
51 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 117 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 9472.00 | 44.60 | 0.00 | 0.00 | true | 0.446111;0.447709;0.447047;0.446176;0.446066 | 0;0;0;0;0 | 8832;10496;9088;8448;10880 | 221184;221184;221184;221184;221184 |
51 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 117 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 135392.00 | 6.20 | 2.63 | 89.09 | true | 0.062275;0.062262;0.062273;0.062270;0.062285 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 135648;134368;136160;136928;133600 |
52 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 27.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 256.00 | 54.60 | 183.75 | 29.40 | false | 0.546130;0.547460;0.545414;0.544972;0.544605 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 256;256;256;256;256 |
53 | InceptionV4/InceptionV4/Mixed_5b/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 22.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 640.00 | 53.70 | 114.84 | 29.40 | false | 0.533878;0.537438;0.536862;0.538296;0.537442 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 640;768;512;768;512 |
54 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 20.667 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 59.70 | 0.00 | 0.00 | true | 0.595595;0.597738;0.596394;0.597268;0.597850 | 0;0;0;0;0 | 0;1792;0;0;0 | 0;0;0;0;0 |
55 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 123.667 | 470528 | 2019328 | 172959232 | GPU_0_bfc | 1548800 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 24.33 | 149022720 | 0.00 | 438912.00 | 12.50 | 339.53 | 6124.31 | false | 0.124734;0.124726;0.124736;0.124733;0.124741 | 149022720;149022720;149022720;149022720;149022720 | 0;0;0;0;0 | 442048;438400;432448;436672;441664 |
55 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 123.667 | 470528 | 2019328 | 172959232 | GPU_0_bfc | 1548800 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 331776.00 | 469.33 | 43.90 | 0.00 | 0.00 | true | 0.438070;0.438668;0.436573;0.442813;0.438865 | 0;0;0;0;0 | 331776;331776;331776;331776;331776 | 512;384;512;384;512 |
55 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 123.667 | 470528 | 2019328 | 172959232 | GPU_0_bfc | 1548800 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 534528 | 0.00 | 48085.33 | 6.20 | 11.12 | 133.63 | true | 0.062271;0.062270;0.062270;0.062268;0.062282 | 534528;534528;534528;534528;534528 | 0;0;6656;0;0 | 45312;48128;54528;49792;46336 |
56 | InceptionV4/InceptionV4/Mixed_5b/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 27.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 0.00 | 54.60 | 306.25 | 29.40 | false | 0.547466;0.546903;0.546131;0.544851;0.545860 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 0;0;0;0;0 |
58 | InceptionV4/InceptionV4/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 384 35 35]] | 24.667 | 1881600 | 0 | 172488192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 405130.67 | 64.70 | 0.00 | 0.00 | true | 0.646441;0.645414;0.647738;0.647484;0.647407 | 0;0;0;0;0 | 0;10496;0;0;0 | 410016;400544;410144;401696;403680 |
59 | InceptionV4/InceptionV4/Mixed_5c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 384 35 35]] | 52.667 | 1881600 | 1881600 | 174369792 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 18.67 | 10644816 | 0.00 | 1882090.67 | 60.70 | 5.66 | 570.25 | true | 0.606616;0.605215;0.606836;0.606991;0.606100 | 10644816;10644816;10644816;10644816;10644816 | 0;0;0;0;0 | 1882176;1892672;1882560;1881536;1881472 |
60 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 108.667 | 313600 | 411904 | 174683392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 22.67 | 61420096 | 0.00 | 412778.67 | 3.10 | 148.80 | 2709.67 | false | 0.031232;0.031230;0.031229;0.031231;0.031231 | 61420096;61420096;61420096;61420096;61420096 | 413632;413120;413632;410304;411584 | 0;0;0;0;256 |
60 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 108.667 | 313600 | 411904 | 174683392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 98624.00 | 832.00 | 43.10 | 0.00 | 0.00 | true | 0.431396;0.430329;0.431544;0.428406;0.430903 | 0;0;0;0;0 | 98624;98624;98624;98624;98624 | 832;704;832;832;832 |
61 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 104.667 | 313600 | 411904 | 174996992 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 23.00 | 61420096 | 0.00 | 313994.67 | 3.10 | 195.61 | 2670.44 | false | 0.031233;0.031232;0.031231;0.031232;0.031233 | 61420096;61420096;61420096;61420096;61420096 | 0;0;0;1792;0 | 314080;313952;313952;314080;313952 |
61 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 104.667 | 313600 | 411904 | 174996992 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 98304.00 | 85.33 | 42.90 | 0.00 | 0.00 | true | 0.431615;0.429888;0.427747;0.428265;0.429016 | 0;0;0;0;0 | 98304;98304;98304;98304;98304 | 128;0;128;0;128 |
62 | InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116 | 470528 | 617984 | 175467520 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 33.67 | 92130144 | 0.00 | 516298.67 | 4.50 | 178.44 | 2736.51 | false | 0.045267;0.045675;0.045323;0.045366;0.044943 | 92130144;92130144;92130144;92130144;92130144 | 0;0;0;0;0 | 517024;517280;510752;517280;514592 |
62 | InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116 | 470528 | 617984 | 175467520 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 597.33 | 43.60 | 0.00 | 0.00 | true | 0.435472;0.437914;0.436596;0.436447;0.436149 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 768;640;512;640;512 |
63 | InceptionV4/InceptionV4/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116 | 470528 | 617984 | 174056448 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 33.67 | 92130144 | 725.33 | 514058.67 | 4.50 | 178.97 | 2736.51 | false | 0.044841;0.046113;0.045125;0.044992;0.045918 | 92130144;92130144;92130144;92130144;92130144 | 640;640;768;768;3456 | 514912;512864;511456;519648;514400 |
63 | InceptionV4/InceptionV4/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116 | 470528 | 617984 | 174056448 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 147456.00 | 12458.67 | 43.60 | 0.00 | 0.00 | true | 0.437076;0.436482;0.435537;0.435761;0.433526 | 0;0;0;0;0 | 147456;147456;147456;147456;149504 | 12416;12288;11904;12672;13184 |
64 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 28 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 40778.67 | 44.50 | 1.91 | 19.60 | true | 0.444595;0.444369;0.445707;0.444175;0.447465 | 78400;78400;78400;78400;78400 | 5888;256;256;256;256 | 39456;38048;40864;42016;43808 |
65 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 23.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 124714.67 | 44.10 | 0.63 | 19.60 | true | 0.440942;0.440437;0.440659;0.440788;0.440054 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 124928;123904;125312;125568;120320 |
66 | InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 21.333 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 2005.33 | 238506.67 | 53.40 | 0.49 | 29.40 | true | 0.534284;0.532904;0.534379;0.534254;0.534751 | 117600;117600;117600;117600;117600 | 239744;240768;236672;239104;231040 | 384;384;384;7040;5248 |
67 | InceptionV4/InceptionV4/Mixed_5c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 21.667 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 1408.00 | 32853.33 | 53.70 | 3.43 | 29.40 | true | 0.535696;0.538890;0.536848;0.537366;0.537202 | 117600;117600;117600;117600;117600 | 384;384;384;6016;3456 | 33920;37248;32512;28288;32128 |
68 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 21 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 42.67 | 43.80 | 0.00 | 0.00 | true | 0.437671;0.438418;0.437515;0.438141;0.438090 | 0;0;0;0;0 | 0;128;0;128;0 | 0;0;0;0;0 |
69 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 19 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 554.67 | 43.50 | 0.00 | 0.00 | true | 0.434961;0.434720;0.435774;0.434869;0.435808 | 0;0;0;0;0 | 0;0;0;0;0 | 640;512;512;512;640 |
70 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 18.67 | 99993600 | 512.00 | 327178.67 | 12.50 | 305.15 | 5356.70 | false | 0.124632;0.124634;0.124632;0.124622;0.124632 | 99993600;99993600;99993600;99993600;99993600 | 256;512;512;512;512 | 325408;329824;326304;333760;324000 |
70 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.67 | 0 | 221184.00 | 59232.00 | 45.10 | 0.00 | 0.00 | true | 0.450998;0.450383;0.452174;0.451000;0.452065 | 0;0;0;0;0 | 63136;59392;59616;58688;54016 | 221184;222016;221184;221184;221184 |
70 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 135114.67 | 6.20 | 2.64 | 89.09 | true | 0.062286;0.062284;0.062278;0.062275;0.062278 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 134784;138272;140256;130464;132288 |
71 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 18.00 | 99993600 | 0.00 | 278869.33 | 12.50 | 358.57 | 5555.20 | false | 0.124633;0.124629;0.124628;0.124629;0.124628 | 99993600;99993600;99993600;99993600;99993600 | 0;2688;0;0;0 | 283136;270976;275072;281184;280352 |
71 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 4906.67 | 44.60 | 0.00 | 0.00 | true | 0.445956;0.444897;0.446032;0.445214;0.446663 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 3840;4224;6016;5376;5120 |
71 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 179498.67 | 6.20 | 1.99 | 89.09 | true | 0.062288;0.062292;0.062294;0.062292;0.062285 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 177024;179712;191232;180128;178656 |
72 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 27 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 640.00 | 54.60 | 114.84 | 29.40 | false | 0.548279;0.544280;0.546879;0.544574;0.546417 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 512;640;256;2048;768 |
73 | InceptionV4/InceptionV4/Mixed_5c/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 21.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 170.67 | 53.70 | 212.02 | 29.40 | false | 0.537395;0.537828;0.534208;0.535573;0.538424 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 0;256;128;256;128 |
74 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 21 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 256.00 | 59.80 | 0.00 | 0.00 | true | 0.589294;0.598316;0.597968;0.597736;0.597762 | 0;0;0;0;0 | 0;0;0;0;0 | 512;256;256;0;256 |
75 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 120 | 470528 | 1724416 | 172959232 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 24.33 | 149022720 | 0.00 | 404618.67 | 12.50 | 368.30 | 6124.31 | false | 0.124734;0.124734;0.124733;0.124715;0.124737 | 149022720;149022720;149022720;149022720;149022720 | 0;0;0;0;0 | 401152;385184;406176;406528;406912 |
75 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 120 | 470528 | 1724416 | 172959232 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 331776.00 | 61450.67 | 43.50 | 0.00 | 0.00 | true | 0.434338;0.434465;0.435337;0.435142;0.437065 | 0;0;0;0;0 | 333568;331776;331776;331776;331776 | 61792;61312;61760;61280;61248 |
75 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 120 | 470528 | 1724416 | 172959232 | GPU_0_bfc | 1253888 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 534528 | 426.67 | 323914.67 | 6.20 | 1.65 | 133.63 | true | 0.062254;0.062241;0.062243;0.062242;0.062248 | 534528;534528;534528;534528;534528 | 2304;1280;0;0;0 | 326432;344736;322592;322720;322336 |
76 | InceptionV4/InceptionV4/Mixed_5c/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 30 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 341.33 | 54.70 | 162.13 | 29.40 | false | 0.549919;0.548472;0.547363;0.545916;0.546072 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 384;256;384;256;384 |
78 | InceptionV4/InceptionV4/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 384 35 35]] | 25.667 | 2508288 | 0 | 173114880 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.33 | 0 | 0.00 | 517781.33 | 64.80 | 0.00 | 0.00 | true | 0.647096;0.639015;0.652544;0.650552;0.646394 | 0;0;0;0;0 | 0;0;0;0;0 | 519136;516800;503040;518848;517696 |
79 | InceptionV4/InceptionV4/Mixed_5d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 384 35 35]] | 52.667 | 1882112 | 1882112 | 174996992 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 18.67 | 10427196 | 0.00 | 1881461.33 | 61.10 | 5.54 | 558.59 | true | 0.610742;0.610694;0.610714;0.611559;0.611602 | 10427196;10427196;10427196;10427196;10427196 | 0;0;0;0;0 | 1881408;1881408;1894208;1881280;1881568 |
80 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107.667 | 313600 | 411904 | 175310592 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 23.00 | 61420096 | 256.00 | 413600.00 | 3.10 | 148.41 | 2670.44 | false | 0.031232;0.031231;0.031231;0.031231;0.031232 | 61420096;61420096;61420096;61420096;61420096 | 256;256;256;256;256 | 413728;413056;413952;413120;413952 |
80 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107.667 | 313600 | 411904 | 175310592 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 98304.00 | 640.00 | 42.80 | 0.00 | 0.00 | true | 0.427631;0.428403;0.427299;0.427983;0.429330 | 0;0;0;0;0 | 98304;98304;98304;98304;98304 | 640;640;640;512;640 |
81 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107 | 313600 | 411904 | 175624192 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 22.67 | 61420096 | 0.00 | 306197.33 | 3.10 | 200.59 | 2709.67 | false | 0.031230;0.031230;0.031229;0.031229;0.031229 | 61420096;61420096;61420096;61420096;61420096 | 0;0;5376;0;0 | 305888;306272;306272;306272;306048 |
81 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 107 | 313600 | 411904 | 175624192 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98304.00 | 384.00 | 43.10 | 0.00 | 0.00 | true | 0.431580;0.431634;0.430800;0.430522;0.431775 | 0;0;0;0;0 | 98304;98304;98304;98304;98304 | 384;384;384;384;384 |
82 | InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.333 | 470528 | 617984 | 176094720 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 33.00 | 92130144 | 0.00 | 499818.67 | 4.50 | 184.33 | 2791.82 | false | 0.046036;0.044792;0.044971;0.044916;0.045399 | 92130144;92130144;92130144;92130144;92130144 | 6656;0;0;0;0 | 499584;501184;498528;498688;501728 |
82 | InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.333 | 470528 | 617984 | 176094720 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 147456.00 | 2133.33 | 43.50 | 0.00 | 0.00 | true | 0.435817;0.432622;0.435549;0.435406;0.431828 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 2432;2432;2048;1664;1920 |
83 | InceptionV4/InceptionV4/Mixed_5d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 119 | 470528 | 617984 | 174056960 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 34.00 | 92130144 | 142592.00 | 519957.33 | 4.50 | 139.05 | 2709.71 | false | 0.046007;0.045499;0.045432;0.044699;0.044656 | 92130144;92130144;92130144;92130144;92130144 | 144256;143488;140928;137728;143360 | 521376;517888;512864;522784;520608 |
83 | InceptionV4/InceptionV4/Mixed_5d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 119 | 470528 | 617984 | 174056960 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 13738.67 | 43.60 | 0.00 | 0.00 | true | 0.434243;0.437746;0.437312;0.436142;0.433858 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 13824;14080;13696;13696;11776 |
84 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 28.667 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 25344.00 | 56576.00 | 44.60 | 0.96 | 19.60 | true | 0.446579;0.444091;0.446043;0.444934;0.446322 | 78400;78400;78400;78400;78400 | 27008;24960;24704;25856;25216 | 53536;54848;59168;58400;56480 |
85 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 21.667 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 78400 | 11008.00 | 105205.33 | 44.10 | 0.67 | 18.09 | true | 0.441031;0.440060;0.441879;0.441696;0.440399 | 78400;78400;78400;78400;78400 | 17664;10496;10240;11648;10880 | 115360;105376;105248;104896;104992 |
86 | InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 22 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 263093.33 | 53.40 | 0.45 | 29.40 | true | 0.533913;0.534124;0.534574;0.533523;0.536127 | 117600;117600;117600;117600;117600 | 256768;267776;277248;258816;262688 | 384;384;384;384;384 |
87 | InceptionV4/InceptionV4/Mixed_5d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 21 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 21248.00 | 53.60 | 5.44 | 29.40 | true | 0.535094;0.537868;0.534984;0.537963;0.533570 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 25856;20608;13824;24448;18688 |
88 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 21.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.437808;0.437623;0.438419;0.438287;0.438388 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;512;0;0 |
89 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 18.667 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 597.33 | 43.70 | 0.00 | 0.00 | true | 0.436744;0.436918;0.436544;0.436605;0.436444 | 0;0;0;0;0 | 0;0;0;0;0 | 640;512;640;512;640 |
90 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.667 | 470528 | 1882112 | 172645376 | GPU_0_bfc | 1411584 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 18.00 | 99993600 | 0.00 | 668170.67 | 12.50 | 149.65 | 5555.20 | false | 0.124614;0.124632;0.124640;0.124637;0.124631 | 99993600;99993600;99993600;99993600;99993600 | 256;0;0;0;0 | 666944;669504;670528;665280;668064 |
90 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.667 | 470528 | 1882112 | 172645376 | GPU_0_bfc | 1411584 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 99306.67 | 44.50 | 0.00 | 0.00 | true | 0.444063;0.445937;0.444302;0.444080;0.445985 | 0;0;0;0;0 | 221184;221184;221184;222208;221184 | 100416;97728;93248;100800;99776 |
90 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.667 | 470528 | 1882112 | 172645376 | GPU_0_bfc | 1411584 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 238656.00 | 6.20 | 1.49 | 89.09 | true | 0.062305;0.062282;0.062295;0.062292;0.062298 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 237760;240448;244160;237760;237248 |
91 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116 | 470528 | 1411584 | 172802304 | GPU_0_bfc | 941056 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 18.33 | 99993600 | 0.00 | 253397.33 | 12.50 | 394.61 | 5454.30 | false | 0.124631;0.124637;0.124623;0.124615;0.124643 | 99993600;99993600;99993600;99993600;99993600 | 0;0;6656;0;0 | 250240;254720;255232;247040;255488 |
91 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116 | 470528 | 1411584 | 172802304 | GPU_0_bfc | 941056 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 32426.67 | 44.50 | 0.00 | 0.00 | true | 0.445313;0.444443;0.444627;0.444596;0.447001 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 31104;32000;34176;30080;34304 |
91 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116 | 470528 | 1411584 | 172802304 | GPU_0_bfc | 941056 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.67 | 356352 | 0.00 | 170037.33 | 6.20 | 2.10 | 97.18 | true | 0.062276;0.062275;0.062278;0.062265;0.062273 | 356352;356352;356352;356352;356352 | 167328;172736;170144;168896;171072 | 0;0;0;0;0 |
92 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 27.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 0.00 | 54.70 | 306.25 | 29.40 | false | 0.546875;0.545980;0.545384;0.547820;0.547129 | 117600;117600;117600;117600;117600 | 0;0;0;0;0 | 384;384;384;384;384 |
93 | InceptionV4/InceptionV4/Mixed_5d/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 22.667 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 85.33 | 53.60 | 250.57 | 29.40 | false | 0.536867;0.535731;0.536391;0.534649;0.536913 | 117600;117600;117600;117600;117600 | 384;10880;384;384;384 | 128;0;128;0;128 |
94 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 20.667 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 59.80 | 0.00 | 0.00 | true | 0.598212;0.597435;0.598272;0.597805;0.597241 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
95 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 125.667 | 470528 | 1863168 | 172959232 | GPU_0_bfc | 1392640 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 24.67 | 149022720 | 0.00 | 409578.67 | 12.50 | 363.84 | 6041.38 | false | 0.124730;0.124726;0.124737;0.124731;0.124723 | 149022720;149022720;149022720;149022720;149022720 | 412544;412192;412512;404032;400384 | 0;0;0;0;0 |
95 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 125.667 | 470528 | 1863168 | 172959232 | GPU_0_bfc | 1392640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.67 | 0 | 331776.00 | 85856.00 | 43.40 | 0.00 | 0.00 | true | 0.433981;0.434632;0.433146;0.433952;0.433522 | 0;0;0;0;0 | 331776;331776;331776;331776;331776 | 85664;85920;86240;85984;83744 |
95 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 125.667 | 470528 | 1863168 | 172959232 | GPU_0_bfc | 1392640 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.67 | 534528 | 0.00 | 298848.00 | 6.20 | 1.79 | 145.77 | true | 0.062251;0.062258;0.062263;0.062266;0.062261 | 534528;534528;534528;534528;534528 | 0;0;0;0;0 | 298656;298400;297312;299488;310304 |
96 | InceptionV4/InceptionV4/Mixed_5d/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 27.667 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 298.67 | 54.60 | 172.27 | 29.40 | false | 0.548717;0.547050;0.545402;0.545068;0.545041 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 256;512;384;256;256 |
98 | InceptionV4/InceptionV4/Mixed_5d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 384 35 35]] | 24 | 2664960 | 0 | 173271552 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 321589.33 | 64.60 | 0.00 | 0.00 | true | 0.644648;0.647465;0.645102;0.647597;0.646081 | 0;0;0;0;0 | 0;0;0;0;0 | 315104;320000;322560;322208;323360 |
99 | InceptionV4/InceptionV4/Mixed_5e/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 384 35 35]] | 52.333 | 2352640 | 2352640 | 175624192 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 18.33 | 10465086 | 256.00 | 1647488.00 | 61.20 | 6.35 | 570.83 | true | 0.612433;0.611743;0.611971;0.613642;0.613076 | 10465086;10465086;10465086;10465086;10465086 | 256;256;256;256;256 | 1653088;1646016;1647488;1648480;1646496 |
100 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 111.667 | 313600 | 411904 | 175937792 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 23.00 | 61420096 | 0.00 | 411562.67 | 3.10 | 149.24 | 2670.44 | false | 0.031230;0.031230;0.031229;0.031231;0.031230 | 61420096;61420096;61420096;61420096;61420096 | 0;0;256;0;0 | 411328;411648;411456;411648;411584 |
100 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 111.667 | 313600 | 411904 | 175937792 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 98304.00 | 597.33 | 43.00 | 0.00 | 0.00 | true | 0.429180;0.428859;0.429094;0.433060;0.430512 | 0;0;0;0;0 | 98304;98304;98560;98304;98304 | 768;384;640;512;640 |
101 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 105 | 313600 | 411904 | 176251392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 23.00 | 61420096 | 0.00 | 292064.00 | 3.10 | 210.30 | 2670.44 | false | 0.031232;0.031232;0.031232;0.031232;0.031231 | 61420096;61420096;61420096;61420096;61420096 | 0;0;0;0;0 | 282720;292736;292256;292000;291936 |
101 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 64 35 35]] | 105 | 313600 | 411904 | 176251392 | GPU_0_bfc | 98304 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 98304.00 | 170.67 | 43.10 | 0.00 | 0.00 | true | 0.429952;0.431429;0.429273;0.431675;0.430805 | 0;0;0;0;0 | 98304;98304;98304;98304;98304 | 1792;128;0;128;256 |
102 | InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.667 | 470528 | 617984 | 176721920 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 33.33 | 92130144 | 0.00 | 499445.33 | 4.50 | 184.46 | 2763.93 | false | 0.044923;0.045775;0.045679;0.044743;0.045171 | 92130144;92130144;92130144;92130144;92130144 | 0;0;0;0;0 | 499296;499808;499232;501856;498432 |
102 | InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 118.667 | 470528 | 617984 | 176721920 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147456.00 | 20746.67 | 43.50 | 0.00 | 0.00 | true | 0.436124;0.436186;0.433808;0.434684;0.435426 | 0;0;0;0;0 | 147456;147456;147456;147456;147456 | 20512;20480;20736;21152;20992 |
103 | InceptionV4/InceptionV4/Mixed_5e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115 | 470528 | 617984 | 174527488 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 33.33 | 92130144 | 42154.67 | 554272.00 | 4.50 | 154.47 | 2763.93 | false | 0.045083;0.044976;0.044713;0.044837;0.045037 | 92130144;92130144;92130144;92130144;92130144 | 42240;42112;42240;42112;42112 | 555040;565728;554592;551104;553184 |
103 | InceptionV4/InceptionV4/Mixed_5e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 115 | 470528 | 617984 | 174527488 | GPU_0_bfc | 147456 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 147541.33 | 2304.00 | 43.60 | 0.00 | 0.00 | true | 0.435903;0.434957;0.435445;0.436944;0.437145 | 0;0;0;0;0 | 147456;147456;147456;147712;147712 | 1920;2432;640;3712;2560 |
104 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 28.333 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 9557.33 | 10378.67 | 45.00 | 3.93 | 19.60 | true | 0.454385;0.445170;0.451961;0.444454;0.453564 | 78400;78400;78400;78400;78400 | 9856;9088;9472;9472;9728 | 10848;10720;10080;9952;10336 |
105 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 64 35 35]] | 22.667 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 78400 | 256.00 | 178378.67 | 44.10 | 0.44 | 19.60 | true | 0.441792;0.440644;0.440657;0.442256;0.440923 | 78400;78400;78400;78400;78400 | 256;256;256;256;256 | 180384;177952;180000;177184;176416 |
106 | InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 21.333 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 3.67 | 117600 | 384.00 | 270421.33 | 53.50 | 0.43 | 32.07 | true | 0.535108;0.534877;0.536150;0.537403;0.534152 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 267648;270208;276480;269568;271488 |
107 | InceptionV4/InceptionV4/Mixed_5e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 21.667 | 470528 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 47488.00 | 53.80 | 2.46 | 29.40 | true | 0.535909;0.538620;0.538255;0.537326;0.536989 | 117600;117600;117600;117600;117600 | 384;384;384;2688;384 | 45184;48256;49408;45056;49024 |
108 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 20.667 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 341.33 | 43.80 | 0.00 | 0.00 | true | 0.437830;0.437997;0.437591;0.438349;0.438467 | 0;0;0;0;0 | 0;0;0;0;0 | 384;256;384;256;384 |
109 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 64 35 35]] | 19 | 313600 | 0 | 172174848 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.70 | 0.00 | 0.00 | true | 0.436847;0.435859;0.436459;0.436921;0.436937 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;256;0;0 |
110 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.667 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 18.67 | 99993600 | 256.00 | 431754.67 | 12.50 | 231.46 | 5356.70 | false | 0.124632;0.124631;0.124631;0.124634;0.124641 | 99993600;99993600;99993600;99993600;99993600 | 423360;434592;425408;455456;435264 | 256;256;256;256;256 |
110 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.667 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.33 | 0 | 221184.00 | 11050.67 | 44.60 | 0.00 | 0.00 | true | 0.445892;0.445317;0.445263;0.446148;0.445305 | 0;0;0;0;0 | 221184;221184;221184;221184;221184 | 10624;8960;9344;14464;13184 |
110 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 116.667 | 470528 | 1306624 | 172645376 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 356352 | 0.00 | 134624.00 | 6.20 | 2.65 | 89.09 | true | 0.062299;0.062300;0.062294;0.062291;0.062297 | 356352;356352;356352;356352;356352 | 0;0;0;0;0 | 127136;136736;132256;135136;136480 |
111 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 121 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 18.33 | 99993600 | 0.00 | 257098.67 | 12.50 | 388.93 | 5454.30 | false | 0.124626;0.124629;0.124610;0.124616;0.124621 | 99993600;99993600;99993600;99993600;99993600 | 0;0;0;0;0 | 253216;247712;258976;259104;259360 |
111 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 121 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 5.00 | 0 | 221184.00 | 9130.67 | 44.50 | 0.00 | 0.00 | true | 0.445754;0.444728;0.444395;0.444813;0.445265 | 0;0;0;0;0 | 221184;221184;221440;221184;221184 | 11392;8704;9728;8064;8960 |
111 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 121 | 470528 | 1306624 | 172802304 | GPU_0_bfc | 836096 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 3.67 | 356352 | 0.00 | 190592.00 | 6.20 | 1.87 | 97.18 | true | 0.062310;0.062279;0.062292;0.062297;0.062288 | 356352;356352;356352;356352;356352 | 190848;198272;187392;193536;186496 | 0;6400;0;0;0 |
112 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 28 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 128.00 | 54.60 | 229.69 | 29.40 | false | 0.548928;0.544621;0.546752;0.546541;0.545971 | 117600;117600;117600;117600;117600 | 256;0;128;0;512 | 384;384;384;384;384 |
113 | InceptionV4/InceptionV4/Mixed_5e/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 21 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 128.00 | 53.70 | 229.69 | 29.40 | false | 0.536385;0.534798;0.537854;0.537638;0.535589 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 0;256;128;0;256 |
114 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0b_3x3/Relu | Relu | [[1 96 35 35]] | 22 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 42.67 | 59.70 | 0.00 | 0.00 | true | 0.596690;0.597239;0.597319;0.597266;0.596832 | 0;0;0;0;0 | 0;0;0;0;0 | 0;128;0;128;0 |
115 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 121.333 | 470528 | 1880576 | 172959232 | GPU_0_bfc | 1410048 | 0 | 0 | 0 | volta_scudnn_winograd_128x128_ldg1_ldg4_relu_tile148t_nt_v1 | 24.33 | 149022720 | 0.00 | 406752.00 | 12.50 | 366.37 | 6124.31 | false | 0.124719;0.124742;0.124724;0.124728;0.124732 | 149022720;149022720;149022720;149022720;149022720 | 0;0;0;0;0 | 408352;408224;405568;405472;406464 |
115 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 121.333 | 470528 | 1880576 | 172959232 | GPU_0_bfc | 1410048 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 331840.00 | 93952.00 | 43.60 | 0.00 | 0.00 | true | 0.435794;0.434671;0.436783;0.435969;0.435374 | 0;0;0;0;0 | 331840;331840;331840;331840;331840 | 91712;92736;97152;94080;95040 |
115 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 96 35 35]] | 121.333 | 470528 | 1880576 | 172959232 | GPU_0_bfc | 1410048 | 0 | 0 | 0 | void cudnn::winograd::generateWinogradTilesKernel<0, float, float>(cudnn::winograd::GenerateWinogradTilesParams<float, float>) | 4.00 | 534528 | 0.00 | 209525.33 | 6.20 | 2.55 | 133.63 | true | 0.062275;0.062272;0.062268;0.062265;0.062263 | 534528;534528;534528;534528;534528 | 0;0;0;0;0 | 211712;209600;206368;209568;209408 |
116 | InceptionV4/InceptionV4/Mixed_5e/Branch_2/Conv2d_0c_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 96 35 35]] | 29.333 | 470528 | 0 | 172488704 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 117600 | 384.00 | 1024.00 | 54.60 | 83.52 | 29.40 | false | 0.547583;0.546185;0.545810;0.546286;0.546621 | 117600;117600;117600;117600;117600 | 384;384;384;384;384 | 2304;1792;384;256;896 |
118 | InceptionV4/InceptionV4/Mixed_5e/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 384 35 35]] | 24.667 | 2352640 | 0 | 172959232 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 408416.00 | 64.70 | 0.00 | 0.00 | true | 0.646689;0.647204;0.647589;0.645189;0.648064 | 0;0;0;0;0 | 0;0;8448;0;0 | 407008;408064;409472;407712;409504 |
119 | InceptionV4/InceptionV4/Mixed_6a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 384 17 17]] | 40 | 443904 | 443904 | 173403136 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 6.00 | 110976 | 512.00 | 444021.33 | 43.00 | 0.25 | 18.50 | true | 0.429285;0.430522;0.429630;0.431056;0.432783 | 110976;110976;110976;110976;110976 | 512;512;512;512;512 | 443584;444480;444096;443872;444096 |
120 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 122 | 940800 | 1235712 | 174343936 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 36.00 | 184260288 | 256.00 | 1054976.00 | 7.70 | 174.62 | 5118.34 | false | 0.077125;0.077699;0.077455;0.077588;0.076936 | 184260288;184260288;184260288;184260288;184260288 | 256;256;256;256;256 | 1057152;1059040;1047264;1051232;1056544 |
120 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 35 35]] | 122 | 940800 | 1235712 | 174343936 | GPU_0_bfc | 294912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 4.00 | 0 | 294912.00 | 45354.67 | 45.80 | 0.00 | 0.00 | true | 0.456923;0.458954;0.458005;0.455246;0.460619 | 0;0;0;0;0 | 295168;294912;294912;294912;294912 | 45056;44288;44928;46080;46240 |
121 | InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 361.667 | 443904 | 5752320 | 174787840 | GPU_0_bfc | 5308416 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 248.67 | 849457536 | 5884032.00 | 2121557.33 | 4.70 | 106.11 | 3416.04 | false | 0.047193;0.047186;0.047235;0.047082;0.047105 | 849457536;849457536;849457536;849457536;849457536 | 5895168;5887744;5869184;5842432;5900096 | 2120224;2103968;2121760;2122688;2123808 |
121 | InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 361.667 | 443904 | 5752320 | 174787840 | GPU_0_bfc | 5308416 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 29.33 | 0 | 5455616.00 | 5419818.67 | 45.90 | 0.00 | 0.00 | true | 0.456505;0.458345;0.459793;0.461015;0.459458 | 0;0;0;0;0 | 5469568;5456960;5442944;5386176;5466944 | 5436480;5426240;5415104;5407040;5418112 |
122 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 35 35]] | 30 | 940800 | 0 | 172435200 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 6.00 | 235200 | 942144.00 | 177322.67 | 57.50 | 0.21 | 39.20 | true | 0.589023;0.575420;0.574881;0.572534;0.574511 | 235200;235200;235200;235200;235200 | 174208;184448;178048;178176;175744 | 942144;942144;942144;942144;942144 |
123 | InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 22 | 443904 | 0 | 172435200 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 110976 | 1728.00 | 2688.00 | 53.80 | 25.13 | 22.20 | false | 0.543587;0.535390;0.534164;0.547461;0.534779 | 110976;110976;110976;110976;110976 | 2688;2688;2176;2944;2688 | 3776;1728;1728;1728;1728 |
124 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 35 35]] | 21.333 | 940800 | 0 | 172435200 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 768.00 | 64.80 | 0.00 | 0.00 | true | 0.646909;0.648261;0.648866;0.648079;0.649981 | 0;0;0;0;0 | 0;0;0;0;0 | 3840;384;128;1024;896 |
125 | InceptionV4/InceptionV4/Mixed_6a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 384 17 17]] | 19 | 443904 | 0 | 172435200 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 57.40 | 0.00 | 0.00 | true | 0.573367;0.573951;0.573615;0.573582;0.574294 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
126 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 35 35]] | 234 | 1097728 | 2646016 | 173532928 | GPU_0_bfc | 1548288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 143.00 | 966406112 | 28672.00 | 1112032.00 | 9.60 | 847.20 | 6758.08 | false | 0.095908;0.095897;0.096624;0.096068;0.095984 | 966406112;966406112;966406112;966406112;966406112 | 26688;33664;32512;25888;26816 | 1109024;1116416;1116480;1110656;1104320 |
126 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 224 35 35]] | 234 | 1097728 | 2646016 | 173532928 | GPU_0_bfc | 1548288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1548288.00 | 485930.67 | 43.70 | 0.00 | 0.00 | true | 0.437766;0.437247;0.437840;0.435603;0.436393 | 0;0;0;0;0 | 465856;486208;487616;486848;484736 | 1548288;1548288;1548288;1548288;1548288 |
127 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 224 35 35]] | 28.333 | 1097728 | 0 | 172592128 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 274400 | 896.00 | 59626.67 | 54.80 | 4.53 | 58.80 | true | 0.543976;0.548509;0.545914;0.548738;0.548847 | 274400;274400;274400;274400;274400 | 896;896;896;896;896 | 61952;58208;57696;58720;62688 |
128 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_0b_3x3/Relu | Relu | [[1 224 35 35]] | 24 | 1097728 | 0 | 172592128 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 1002.67 | 62.80 | 0.00 | 0.00 | true | 0.628085;0.627896;0.628586;0.627911;0.627207 | 0;0;0;0;0 | 0;0;0;0;0 | 1088;832;960;1216;960 |
129 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 191.667 | 295936 | 2360320 | 172888064 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 100.00 | 330375424 | 192.00 | 357834.67 | 3.10 | 922.77 | 3303.75 | false | 0.031247;0.031247;0.031248;0.031247;0.031247 | 330375424;330375424;330375424;330375424;330375424 | 361344;359296;353216;358880;355328 | 192;192;320;192;192 |
129 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 191.667 | 295936 | 2360320 | 172888064 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 14.00 | 0 | 2064384.00 | 1085941.33 | 43.90 | 0.00 | 0.00 | true | 0.438356;0.437028;0.442856;0.439810;0.437944 | 0;0;0;0;0 | 2064384;2064384;2064384;2064384;2064384 | 1084480;1080608;1093312;1077696;1092736 |
130 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 28.333 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1920.00 | 25514.67 | 45.00 | 2.70 | 18.50 | true | 0.452525;0.445807;0.453568;0.444005;0.452435 | 73984;73984;73984;73984;73984 | 26112;26624;18688;24448;25984 | 1344;1024;3392;3072;1344 |
131 | InceptionV4/InceptionV4/Mixed_6a/Branch_1/Conv2d_1a_3x3/Relu | Relu | [[1 256 17 17]] | 21.667 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 853.33 | 43.80 | 0.00 | 0.00 | true | 0.437770;0.437859;0.437476;0.437616;0.437290 | 0;0;0;0;0 | 0;10752;0;0;0 | 896;640;896;768;896 |
133 | InceptionV4/InceptionV4/Mixed_6b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 48.667 | 1183744 | 1183744 | 174142976 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 13.33 | 7044412 | 512.00 | 984821.33 | 53.20 | 7.15 | 528.34 | true | 0.531513;0.531382;0.532014;0.532647;0.531175 | 7044412;7044412;7044412;7044412;7044412 | 512;512;512;512;512 | 982656;983648;989984;984256;986560 |
134 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140.333 | 221952 | 1008384 | 174364928 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 388608.00 | 3.10 | 323.94 | 2375.18 | false | 0.031245;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 392416;385184;399840;376576;388224 | 0;0;0;0;0 |
134 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140.333 | 221952 | 1008384 | 174364928 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 530517.33 | 43.40 | 0.00 | 0.00 | true | 0.432534;0.433944;0.434147;0.431759;0.435066 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 525184;533184;519264;542016;533184 |
135 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 142 | 221952 | 1008384 | 174586880 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 177866.67 | 3.10 | 707.75 | 2375.18 | false | 0.031247;0.031247;0.031246;0.031247;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 176832;178656;179520;178112;176320 |
135 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 142 | 221952 | 1008384 | 174586880 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 83594.67 | 44.00 | 0.00 | 0.00 | true | 0.435932;0.440573;0.440416;0.443950;0.439340 | 0;0;0;0;0 | 786432;786432;786432;789760;786432 | 86176;83200;82720;83968;83616 |
136 | InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 167.667 | 443904 | 2016768 | 175030784 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 78.33 | 251769216 | 2602.67 | 471072.00 | 4.80 | 531.52 | 3214.09 | false | 0.047454;0.047583;0.047474;0.047842;0.047824 | 251769216;251769216;251769216;251769216;251769216 | 2816;1920;2624;2688;2496 | 484608;476864;467456;459360;468896 |
136 | InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 167.667 | 443904 | 2016768 | 175030784 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.67 | 0 | 1572864.00 | 1259680.00 | 43.70 | 0.00 | 0.00 | true | 0.431378;0.438045;0.441353;0.440555;0.431090 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 1251328;1252864;1263200;1270272;1262976 |
137 | InceptionV4/InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.333 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 69.00 | 83923072 | 1005312.00 | 240832.00 | 3.10 | 67.35 | 1216.28 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 1003520;1004032;1008640;1008384;1002752 | 238304;236544;231264;247648;248384 |
137 | InceptionV4/InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152.333 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.00 | 0 | 524458.67 | 298570.67 | 43.40 | 0.00 | 0.00 | true | 0.433955;0.437147;0.431458;0.436416;0.432041 | 0;0;0;0;0 | 299968;302816;308704;292928;292160 | 524416;524480;524480;524416;524480 |
138 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 28.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 55488 | 36864.00 | 16192.00 | 45.50 | 1.05 | 11.89 | true | 0.456457;0.453743;0.455769;0.454786;0.454520 | 55488;55488;55488;55488;55488 | 16672;16512;16160;15904;15776 | 36992;36736;36864;38784;36224 |
139 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 22 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 120192.00 | 80469.33 | 44.50 | 0.28 | 13.87 | true | 0.444704;0.444378;0.445713;0.445244;0.444455 | 55488;55488;55488;55488;55488 | 120320;120320;119936;119936;125056 | 81792;80384;80000;79488;81024 |
140 | InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 23 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 250186.67 | 51.90 | 0.44 | 27.74 | true | 0.517292;0.517992;0.518881;0.518774;0.520225 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1536 | 240288;252576;249504;250784;250272 |
141 | InceptionV4/InceptionV4/Mixed_6b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 20.667 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 41045.33 | 43.50 | 0.89 | 9.25 | true | 0.434430;0.435333;0.436282;0.434990;0.434073 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 42752;40192;43520;40192;32000 |
142 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 64810.67 | 43.90 | 0.00 | 0.00 | true | 0.438813;0.439235;0.438652;0.439213;0.439263 | 0;0;0;0;0 | 0;0;0;0;14080 | 65152;65408;64384;64640;64640 |
143 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 19.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 170.67 | 43.60 | 0.00 | 0.00 | true | 0.436089;0.436149;0.436253;0.436363;0.435755 | 0;0;0;0;0 | 0;0;0;0;0 | 896;384;0;128;0 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 209 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 35.00 | 244930560 | 138677.33 | 1141994.67 | 8.40 | 191.25 | 6998.02 | false | 0.083988;0.083987;0.084375;0.083926;0.084148 | 244930560;244930560;244930560;244930560;244930560 | 135648;136928;140640;138464;145760 | 1140128;1149568;1136288;1150496;1117376 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 209 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 18.00 | 13123584 | 20842.67 | 3735722.67 | 19.70 | 3.49 | 729.09 | true | 0.196722;0.196995;0.197055;0.196542;0.196426 | 13123584;13123584;13123584;13123584;13123584 | 21056;20800;20800;20864;20864 | 3758528;3719168;3725184;3732480;3749504 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 209 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 14.33 | 1161984 | 233248.00 | 703040.00 | 2.00 | 1.24 | 81.07 | true | 0.019894;0.019894;0.019894;0.019894;0.019895 | 1161984;1161984;1161984;1161984;1161984 | 233248;232864;233120;233376;233376 | 696000;701728;711392;695104;712576 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 209 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 301696.00 | 43.40 | 0.00 | 0.00 | true | 0.432183;0.433819;0.434613;0.434057;0.433325 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 305536;300576;290016;298976;306656 |
144 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 209 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 8.67 | 1090176 | 18496.00 | 338197.33 | 2.00 | 3.06 | 125.78 | true | 0.019870;0.019872;0.019871;0.019870;0.019869 | 1090176;1090176;1090176;1090176;1090176 | 340160;338112;339712;336768;333984 | 18560;18432;17920;18752;18496 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 204.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 1989962.67 | 1654858.67 | 10.00 | 78.40 | 9217.82 | false | 0.099718;0.099959;0.100222;0.099947;0.099771 | 285752320;285752320;285752320;285752320;285752320 | 2002464;1974176;1968288;2010656;1993248 | 1654720;1673728;1658048;1651808;1633856 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 204.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 18.67 | 1161984 | 243872.00 | 666432.00 | 2.00 | 1.28 | 62.25 | true | 0.019902;0.019902;0.019902;0.019903;0.019902 | 1161984;1161984;1161984;1161984;1161984 | 243872;248992;243872;243872;243872 | 648736;682784;665024;658976;675296 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 204.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.67 | 15310848 | 7178.67 | 4882272.00 | 18.30 | 3.13 | 977.27 | true | 0.180838;0.183345;0.184998;0.185238;0.181079 | 15310848;15310848;15310848;15310848;15310848 | 9440;6880;6624;7136;7520 | 4874336;4841536;4896672;4898528;4875808 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 204.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.67 | 0 | 1204416.00 | 1069706.67 | 44.30 | 0.00 | 0.00 | true | 0.440852;0.440411;0.444377;0.444170;0.442656 | 0;0;0;0;0 | 1083424;1088192;1064544;1041632;1061152 | 1204672;1204416;1204416;1204416;1204416 |
145 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 204.667 | 259072 | 8274688 | 171901440 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 9.67 | 1271872 | 14421.33 | 94197.33 | 2.30 | 11.71 | 131.57 | true | 0.023152;0.023149;0.023155;0.023153;0.023151 | 1271872;1271872;1271872;1271872;1271872 | 17792;12800;15360;15104;12800 | 91552;90944;92736;99456;98304 |
146 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 55488 | 226624.00 | 41493.33 | 46.00 | 0.21 | 11.10 | true | 0.459358;0.463216;0.458322;0.462062;0.458397 | 55488;55488;55488;55488;55488 | 41152;41600;44096;39360;41728 | 226624;227136;226624;226624;226624 |
147 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 24.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 938.67 | 45.40 | 30.04 | 16.18 | false | 0.454101;0.455521;0.454053;0.453415;0.453187 | 64736;64736;64736;64736;64736 | 1664;768;896;1152;640 | 1472;1216;1216;1216;1216 |
148 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 22 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 256.00 | 53141.33 | 45.50 | 0.00 | 0.00 | true | 0.455306;0.454455;0.455146;0.456669;0.455371 | 0;0;0;0;0 | 256;256;256;256;256 | 50720;54304;50528;54400;54912 |
149 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 22 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 170.67 | 43.80 | 0.00 | 0.00 | true | 0.437963;0.438004;0.437681;0.437577;0.437918 | 0;0;0;0;0 | 0;0;0;0;0 | 256;0;0;256;384 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 1550624.00 | 1757120.00 | 10.00 | 86.39 | 9217.82 | false | 0.099946;0.100428;0.100072;0.100165;0.099995 | 285752320;285752320;285752320;285752320;285752320 | 1756800;1761952;1769024;1752608;1749344 | 1589024;1534496;1510048;1547168;1570208 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 4544.00 | 5021632.00 | 17.70 | 3.05 | 956.93 | true | 0.178269;0.176901;0.176934;0.176179;0.176331 | 15310848;15310848;15310848;15310848;15310848 | 5042272;5018272;4988960;5036960;5009664 | 4928;4672;4416;3776;4544 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 222368.00 | 635573.33 | 2.00 | 1.35 | 77.47 | true | 0.019986;0.019990;0.019986;0.019977;0.019983 | 1161984;1161984;1161984;1161984;1161984 | 222368;222368;222368;222368;222368 | 634912;637120;644064;628256;634688 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204480.00 | 879978.67 | 44.20 | 0.00 | 0.00 | true | 0.440199;0.440765;0.448735;0.441174;0.442620 | 0;0;0;0;0 | 1204480;1204480;1204480;1204480;1204480 | 874752;871840;892896;872288;910304 |
150 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8274688 | 171938560 | GPU_0_bfc | 8015616 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 629.33 | 75274.67 | 2.30 | 16.76 | 254.37 | true | 0.023016;0.022999;0.022998;0.022998;0.023001 | 1271872;1271872;1271872;1271872;1271872 | 72736;76256;71776;79200;76832 | 800;544;416;544;2336 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 207 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 36.00 | 380608512 | 7013418.67 | 2727797.33 | 11.00 | 39.07 | 10572.46 | false | 0.110769;0.110299;0.110516;0.110475;0.110399 | 380608512;380608512;380608512;380608512;380608512 | 7029632;6991232;6967552;7019392;7068416 | 2716864;2723584;2742944;2709600;2753536 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 207 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 24.67 | 20414464 | 87050.67 | 7051402.67 | 20.00 | 2.86 | 827.60 | true | 0.199861;0.199079;0.200672;0.200272;0.200328 | 20414464;20414464;20414464;20414464;20414464 | 87392;87904;85856;85600;87904 | 7043104;7050464;7031296;7061120;7060640 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 207 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.67 | 0 | 1606080.00 | 1106794.67 | 44.10 | 0.00 | 0.00 | true | 0.446225;0.441800;0.442687;0.439186;0.439344 | 0;0;0;0;0 | 1606080;1606080;1606080;1606080;1606080 | 1105696;1101664;1113056;1108384;1106304 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 207 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 260128.00 | 632960.00 | 2.30 | 1.52 | 123.24 | true | 0.023162;0.023177;0.023169;0.023172;0.023176 | 1355648;1355648;1355648;1355648;1355648 | 265504;260128;260128;260128;260128 | 627712;634048;640320;637120;627008 |
151 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 207 | 295936 | 12001280 | 172012544 | GPU_0_bfc | 11705344 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1453568 | 5280.00 | 37237.33 | 2.60 | 34.19 | 242.26 | false | 0.026237;0.026238;0.026235;0.026249;0.026245 | 1453568;1453568;1453568;1453568;1453568 | 5536;5280;5280;5280;5280 | 36896;37536;35872;38944;37280 |
152 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 29.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 64736 | 260160.00 | 17920.00 | 46.00 | 0.23 | 12.95 | true | 0.460985;0.460161;0.460583;0.459846;0.460539 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;260160 | 18048;17152;19328;18560;16128 |
153 | InceptionV4/InceptionV4/Mixed_6b/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 21.667 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 73984 | 1344.00 | 426.67 | 45.00 | 41.78 | 17.07 | false | 0.448973;0.450085;0.454538;0.447565;0.449643 | 73984;73984;73984;73984;73984 | 1344;1344;1344;2368;1344 | 384;128;384;512;2560 |
154 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 21.333 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 426.67 | 43.90 | 0.00 | 0.00 | true | 0.439302;0.435240;0.439192;0.439938;0.440449 | 0;0;0;0;0 | 256;2048;256;256;256 | 256;256;800;512;512 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 35.67 | 333032448 | 5118709.33 | 2266101.33 | 9.90 | 45.10 | 9337.27 | false | 0.099167;0.099247;0.099410;0.099377;0.099174 | 333032448;333032448;333032448;333032448;333032448 | 5229600;5074400;5058400;4946080;5223328 | 2265344;2266624;2266336;2288224;2242944 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.33 | 17862656 | 28437.33 | 5877152.00 | 20.00 | 3.02 | 1093.65 | true | 0.200906;0.200400;0.200163;0.199992;0.199942 | 17862656;17862656;17862656;17862656;17862656 | 32832;27840;26944;26944;30528 | 5865280;5881472;5884704;5862464;5884736 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1405184.00 | 473642.67 | 43.70 | 0.00 | 0.00 | true | 0.436941;0.438883;0.437153;0.434473;0.437650 | 0;0;0;0;0 | 480960;470784;471840;478304;453568 | 1410304;1405184;1405184;1405184;1405184 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 832437.33 | 2.30 | 1.24 | 123.24 | true | 0.023168;0.023158;0.023183;0.023156;0.023172 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 839488;828320;818464;829504;844640 |
155 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190 | 481024 | 11137536 | 172234496 | GPU_0_bfc | 10656512 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 1376.00 | 42666.67 | 2.30 | 28.88 | 254.37 | false | 0.022956;0.022975;0.022975;0.022970;0.022956 | 1271872;1271872;1271872;1271872;1271872 | 1280;1536;1312;1696;1184 | 40448;42400;45152;46432;38400 |
156 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 28.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 640.00 | 45.70 | 34.88 | 16.18 | false | 0.457948;0.459203;0.456966;0.456637;0.456527 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 640;640;512;640;640 |
157 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 21.333 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 96.00 | 554.67 | 44.90 | 0.00 | 0.00 | true | 0.448895;0.448451;0.448668;0.448412;0.448726 | 0;0;0;0;0 | 96;96;4960;96;96 | 640;512;640;256;512 |
158 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 178.667 | 295936 | 2796288 | 172271360 | GPU_0_bfc | 2500352 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 81.00 | 256975104 | 6933.33 | 148789.33 | 3.10 | 1650.21 | 3172.53 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 6848;6848;11968;7104;6848 | 146624;153568;150976;142176;148768 |
158 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 178.667 | 295936 | 2796288 | 172271360 | GPU_0_bfc | 2500352 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1605888.00 | 777472.00 | 43.80 | 0.00 | 0.00 | true | 0.438959;0.435324;0.434833;0.440576;0.438449 | 0;0;0;0;0 | 784352;770016;771904;776160;786176 | 1605888;1605888;1605888;1605888;1605888 |
159 | InceptionV4/InceptionV4/Mixed_6b/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 31.667 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1024.00 | 12256.00 | 44.40 | 5.57 | 18.50 | true | 0.444009;0.444890;0.444349;0.443734;0.443276 | 73984;73984;73984;73984;73984 | 1024;1024;1024;1024;1024 | 11936;12288;12544;12832;11904 |
161 | InceptionV4/InceptionV4/Mixed_6b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 25 | 1183744 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 44032.00 | 61.30 | 0.00 | 0.00 | true | 0.613081;0.613046;0.613000;0.613901;0.615460 | 0;0;0;0;0 | 44128;42208;44512;44192;43776 | 0;0;0;0;0 |
162 | InceptionV4/InceptionV4/Mixed_6c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 49.667 | 1479936 | 1479936 | 173270272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 14.00 | 5985412 | 8554.67 | 752128.00 | 52.90 | 7.87 | 427.53 | true | 0.530209;0.531607;0.528113;0.529737;0.528454 | 5985412;5985412;5985412;5985412;5985412 | 8512;8512;8576;13632;8576 | 751072;754464;752096;752640;751648 |
163 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 146.333 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 472170.67 | 3.10 | 266.61 | 2375.18 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 480864;469504;459104;482976;466144 | 0;256;0;0;0 |
163 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 146.333 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.67 | 0 | 786432.00 | 523072.00 | 43.90 | 0.00 | 0.00 | true | 0.438184;0.439993;0.439240;0.443830;0.438251 | 0;0;0;0;0 | 786432;786432;786496;786432;786432 | 514080;526592;534784;512160;528544 |
164 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 116362.67 | 3.10 | 1081.83 | 2375.18 | false | 0.031245;0.031247;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 114016;121824;114400;113760;120672 |
164 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 106272.00 | 43.10 | 0.00 | 0.00 | true | 0.433281;0.429527;0.435073;0.428788;0.428378 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 108448;101088;108064;108576;102304 |
165 | InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 168 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 78.33 | 251769216 | 2048.00 | 354357.33 | 4.80 | 706.41 | 3214.09 | false | 0.047518;0.047234;0.047479;0.047963;0.047566 | 251769216;251769216;251769216;251769216;251769216 | 358464;353216;344480;358496;351392 | 1920;1920;2432;1920;2304 |
165 | InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 168 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.67 | 0 | 1572928.00 | 1203573.33 | 44.30 | 0.00 | 0.00 | true | 0.443328;0.442062;0.454048;0.440488;0.443604 | 0;0;0;0;0 | 1572928;1572928;1572928;1572864;1572928 | 1196448;1204352;1212032;1200416;1205952 |
166 | InceptionV4/InceptionV4/Mixed_6c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 151.333 | 147968 | 1183744 | 173122304 | GPU_0_bfc | 1035776 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 83923072 | 782890.67 | 244576.00 | 3.10 | 81.68 | 1252.58 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 783488;781312;790656;782208;782976 | 245984;243136;245184;245408;240224 |
166 | InceptionV4/InceptionV4/Mixed_6c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 151.333 | 147968 | 1183744 | 173122304 | GPU_0_bfc | 1035776 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524288.00 | 277066.67 | 43.20 | 0.00 | 0.00 | true | 0.431971;0.434621;0.433031;0.430864;0.428576 | 0;0;0;0;0 | 524288;524288;524288;524288;524288 | 277824;278496;271648;274880;280992 |
167 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 28 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223232.00 | 6186.67 | 45.70 | 0.24 | 13.87 | true | 0.457998;0.456292;0.457373;0.462506;0.456734 | 55488;55488;55488;55488;55488 | 228864;223232;223232;223232;223232 | 13568;6272;6144;6144;6144 |
168 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 22.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 207872.00 | 45888.00 | 44.90 | 0.22 | 13.87 | true | 0.446247;0.448514;0.448337;0.449828;0.450550 | 55488;55488;55488;55488;55488 | 207872;207872;207872;207872;207872 | 45568;45984;45984;45696;46112 |
169 | InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 23.333 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 344213.33 | 51.90 | 0.32 | 27.74 | true | 0.519980;0.517606;0.517784;0.521163;0.520052 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1536 | 338848;346880;347136;346912;335616 |
170 | InceptionV4/InceptionV4/Mixed_6c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 20 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 17973.33 | 43.40 | 2.00 | 9.25 | true | 0.434009;0.433940;0.433998;0.435960;0.433277 | 36992;36992;36992;36992;36992 | 512;512;512;6400;512 | 18944;16672;23072;18304;11552 |
171 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 256.00 | 43434.67 | 44.00 | 0.00 | 0.00 | true | 0.439421;0.440799;0.439385;0.440332;0.440159 | 0;0;0;0;0 | 256;256;256;256;512 | 43392;43264;43648;43904;43136 |
172 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 896.00 | 43.60 | 0.00 | 0.00 | true | 0.435843;0.435823;0.436085;0.435800;0.435754 | 0;0;0;0;0 | 0;256;0;0;0 | 896;768;1024;1024;512 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.00 | 244930560 | 3978.67 | 1098858.67 | 8.70 | 222.09 | 8164.35 | false | 0.086735;0.086595;0.086771;0.086656;0.086741 | 244930560;244930560;244930560;244930560;244930560 | 1101600;1092448;1102528;1085504;1105728 | 2912;4576;5280;1824;4448 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 13.33 | 13123584 | 2368.00 | 4023221.33 | 16.80 | 3.26 | 984.29 | true | 0.167639;0.169389;0.168639;0.167269;0.168219 | 13123584;13123584;13123584;13123584;13123584 | 3985152;4040640;4013312;4023392;4032960 | 2368;2368;2432;2368;2368 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222624.00 | 821952.00 | 2.00 | 1.11 | 105.63 | true | 0.019962;0.019962;0.019995;0.019990;0.020006 | 1161984;1161984;1161984;1161984;1161984 | 223136;222624;222624;222112;222624 | 832192;819616;814048;834752;808128 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 969770.67 | 44.80 | 0.00 | 0.00 | true | 0.452338;0.442680;0.443905;0.451942;0.448127 | 0;0;0;0;0 | 965984;959968;978272;969056;974272 | 1032192;1032192;1032192;1032192;1032192 |
173 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 191 | 221952 | 7603712 | 171864320 | GPU_0_bfc | 7381760 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 1386.67 | 265728.00 | 2.00 | 4.08 | 218.04 | true | 0.019932;0.019929;0.019945;0.019930;0.019931 | 1090176;1090176;1090176;1090176;1090176 | 266368;263488;265728;266272;265184 | 1344;1856;1472;1344;1344 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 2205386.67 | 1875264.00 | 10.00 | 70.03 | 9217.82 | false | 0.099784;0.099844;0.099810;0.099460;0.099741 | 285752320;285752320;285752320;285752320;285752320 | 2198496;2264928;2213728;2179680;2203936 | 1880512;1849888;1871776;1878784;1875232 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 15310848 | 5941.33 | 4862944.00 | 18.40 | 3.14 | 1020.72 | true | 0.183488;0.184523;0.182704;0.184978;0.185712 | 15310848;15310848;15310848;15310848;15310848 | 5344;6496;5984;4832;6496 | 4843968;4888480;4883040;4822944;4861824 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 228768.00 | 607818.67 | 2.00 | 1.39 | 77.47 | true | 0.019898;0.019898;0.019899;0.019898;0.019899 | 1161984;1161984;1161984;1161984;1161984 | 228768;228768;228768;228768;228768 | 616256;597344;606272;610464;606720 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204672.00 | 1056533.33 | 44.40 | 0.00 | 0.00 | true | 0.450495;0.443609;0.444914;0.443948;0.442628 | 0;0;0;0;0 | 1072064;1057280;1042848;1068832;1043488 | 1204928;1204672;1204672;1204672;1204672 |
174 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192 | 259072 | 8550400 | 171901440 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 7.00 | 1271872 | 7829.33 | 155488.00 | 2.30 | 7.79 | 181.70 | true | 0.023088;0.023089;0.023093;0.023087;0.023094 | 1271872;1271872;1271872;1271872;1271872 | 156032;155680;149760;154752;156224 | 6944;6688;8064;8992;8480 |
175 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 30.667 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 55488 | 223296.00 | 18218.67 | 45.90 | 0.23 | 12.81 | true | 0.458941;0.458637;0.459206;0.457432;0.459928 | 55488;55488;55488;55488;55488 | 225344;223296;223296;223296;223296 | 21760;17152;18176;15104;19328 |
176 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 22.333 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 72117.33 | 45.30 | 0.88 | 16.18 | true | 0.453427;0.454297;0.453442;0.453501;0.453161 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 69248;70272;72800;73280;74496 |
177 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 22.667 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 181.33 | 43.90 | 0.00 | 0.00 | true | 0.439267;0.439589;0.439405;0.438676;0.438977 | 0;0;0;0;0 | 128;0;416;0;416 | 256;256;256;256;256 |
178 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 19.333 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 597.33 | 43.70 | 0.00 | 0.00 | true | 0.437249;0.437428;0.437135;0.437352;0.437484 | 0;0;0;0;0 | 640;256;512;640;640 | 0;0;0;0;0 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 1711669.33 | 1929877.33 | 10.00 | 78.47 | 9217.82 | false | 0.100040;0.099711;0.099892;0.099914;0.100169 | 285752320;285752320;285752320;285752320;285752320 | 1640384;1729312;1718720;1762624;1686976 | 1940608;1915648;1923552;1951776;1925472 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.67 | 15310848 | 4672.00 | 4866506.67 | 18.20 | 3.14 | 977.27 | true | 0.181648;0.184151;0.183454;0.180691;0.181937 | 15310848;15310848;15310848;15310848;15310848 | 4855648;4888448;4856928;4851616;4886944 | 6272;4736;3328;4992;4288 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.67 | 1161984 | 222368.00 | 643669.33 | 2.00 | 1.34 | 79.22 | true | 0.019972;0.019981;0.019965;0.019963;0.019972 | 1161984;1161984;1161984;1161984;1161984 | 222368;222368;222368;222368;222368 | 643040;637056;648448;651872;639520 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.33 | 0 | 1204480.00 | 774656.00 | 43.90 | 0.00 | 0.00 | true | 0.435946;0.439533;0.438857;0.441234;0.439998 | 0;0;0;0;0 | 1204480;1204480;1204736;1204480;1204480 | 771040;777632;781536;775296;763712 |
179 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 1376.00 | 80160.00 | 2.30 | 15.60 | 254.37 | true | 0.023001;0.023001;0.023000;0.023011;0.023001 | 1271872;1271872;1271872;1271872;1271872 | 1472;1312;1344;992;1568 | 81792;78592;80096;73248;84928 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.333 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 36.33 | 380608512 | 7048074.67 | 2596693.33 | 11.10 | 39.46 | 10475.56 | false | 0.110576;0.110261;0.110571;0.110482;0.110648 | 380608512;380608512;380608512;380608512;380608512 | 7074464;7016064;7096704;7039616;7030144 | 2585088;2616864;2588128;2634400;2576096 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.333 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 25.00 | 20414464 | 87712.00 | 6968320.00 | 20.00 | 2.89 | 816.58 | true | 0.197073;0.199516;0.202269;0.200629;0.201204 | 20414464;20414464;20414464;20414464;20414464 | 6970656;6957152;6978368;6960864;6973440 | 91680;85344;91040;85536;86560 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.333 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1606080.00 | 928245.33 | 43.90 | 0.00 | 0.00 | true | 0.438117;0.436555;0.441276;0.439380;0.439895 | 0;0;0;0;0 | 1606080;1606080;1611456;1606080;1606080 | 925984;937856;930624;928128;914944 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.333 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259872.00 | 750112.00 | 2.30 | 1.34 | 123.24 | true | 0.023188;0.023187;0.023162;0.023160;0.023154 | 1355648;1355648;1355648;1355648;1355648 | 259872;259872;259872;259872;259872 | 722176;754272;748640;747424;755552 |
180 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.333 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1453568 | 5365.33 | 41002.67 | 2.60 | 31.35 | 242.26 | false | 0.026260;0.026260;0.026265;0.026275;0.026260 | 1453568;1453568;1453568;1453568;1453568 | 5536;5280;5280;5280;11936 | 40960;43648;38272;40576;41472 |
181 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 29.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 64736 | 260160.00 | 34432.00 | 46.00 | 0.22 | 13.87 | true | 0.459505;0.460335;0.460175;0.459396;0.459524 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;260160 | 35200;32768;35712;31232;35328 |
182 | InceptionV4/InceptionV4/Mixed_6c/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 22.667 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 73984 | 1344.00 | 426.67 | 44.90 | 41.78 | 14.80 | false | 0.452416;0.447916;0.449727;0.448318;0.448215 | 73984;73984;73984;73984;73984 | 1344;1344;1344;2368;1344 | 0;0;256;1024;1664 |
183 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 20 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 256.00 | 256.00 | 43.90 | 0.00 | 0.00 | true | 0.438976;0.439371;0.439694;0.439123;0.439629 | 0;0;0;0;0 | 256;256;256;256;256 | 256;256;256;256;256 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 35.67 | 333032448 | 5129898.67 | 2335456.00 | 9.90 | 44.61 | 9337.27 | false | 0.099349;0.099256;0.099199;0.099393;0.099193 | 333032448;333032448;333032448;333032448;333032448 | 5188608;5181952;5086336;5052160;5121408 | 2349216;2318336;2292384;2341312;2346720 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.67 | 17862656 | 30741.33 | 5858208.00 | 20.30 | 3.03 | 1071.74 | true | 0.202446;0.202430;0.201187;0.203067;0.202787 | 17862656;17862656;17862656;17862656;17862656 | 37440;30784;31296;28736;30144 | 5855104;5864800;5884416;5854720;5846784 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1405184.00 | 488810.67 | 43.60 | 0.00 | 0.00 | true | 0.434321;0.436878;0.437199;0.434596;0.436927 | 0;0;0;0;0 | 1405184;1405184;1405184;1405184;1405184 | 491552;479328;495552;475552;503232 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.00 | 1355648 | 259360.00 | 756288.00 | 2.30 | 1.33 | 135.56 | true | 0.023178;0.023176;0.023177;0.023171;0.023176 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 752192;755776;747232;760896;769696 |
184 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 443648 | 9783808 | 172197120 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 1271872 | 1056.00 | 41130.67 | 2.30 | 30.15 | 272.52 | false | 0.022971;0.022972;0.022973;0.022972;0.022973 | 1271872;1271872;1271872;1271872;1271872 | 40832;39424;44032;41600;40960 | 6176;800;1056;1184;928 |
185 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 28 | 443648 | 0 | 171938048 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 64736 | 1216.00 | 85.33 | 45.70 | 49.75 | 13.87 | false | 0.456333;0.457116;0.459787;0.457518;0.456374 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 768;256;0;0;0 |
186 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 20.333 | 443648 | 0 | 171938048 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 96.00 | 0.00 | 44.90 | 0.00 | 0.00 | true | 0.449423;0.448889;0.449379;0.448760;0.449157 | 0;0;0;0;0 | 96;96;96;96;96 | 0;256;0;0;0 |
187 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 178.333 | 295936 | 1901568 | 172233984 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 80.00 | 256975104 | 7360.00 | 248768.00 | 3.10 | 1003.31 | 3212.19 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 243936;255488;254560;247808;238432 | 5568;12224;10944;5568;5312 |
187 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 178.333 | 295936 | 1901568 | 172233984 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1605888.00 | 611210.67 | 43.70 | 0.00 | 0.00 | true | 0.436778;0.436257;0.436736;0.434217;0.437757 | 0;0;0;0;0 | 1605888;1605888;1605888;1605888;1605888 | 613152;610880;620576;603072;609600 |
188 | InceptionV4/InceptionV4/Mixed_6c/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 28.333 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1024.00 | 35594.67 | 44.50 | 2.02 | 18.50 | true | 0.444469;0.445093;0.442819;0.445684;0.444709 | 73984;73984;73984;73984;73984 | 1024;1024;1024;1024;1024 | 36736;28096;38912;34432;35616 |
190 | InceptionV4/InceptionV4/Mixed_6c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 25.333 | 1479936 | 0 | 172086528 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 253834.67 | 61.60 | 0.00 | 0.00 | true | 0.617204;0.615704;0.616878;0.615168;0.614523 | 0;0;0;0;0 | 252672;253984;253440;254080;254336 | 0;0;0;0;5120 |
191 | InceptionV4/InceptionV4/Mixed_6d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 48.667 | 1183744 | 1183744 | 173270272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 14.00 | 5981137 | 4096.00 | 888288.00 | 52.80 | 6.70 | 427.22 | true | 0.528350;0.529488;0.528267;0.528311;0.528473 | 5981137;5981137;5981137;5981137;5981137 | 4096;4096;4096;9216;4096 | 888512;888352;888192;888320;888192 |
192 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 143 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 768.00 | 432074.67 | 3.10 | 290.83 | 2375.18 | false | 0.031247;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;2816;0;2304 | 433088;427616;430816;432320;434016 |
192 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 143 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786496.00 | 551466.67 | 43.80 | 0.00 | 0.00 | true | 0.436265;0.442094;0.435024;0.439559;0.436686 | 0;0;0;0;0 | 786496;786496;786496;786496;786496 | 550112;556192;552832;551456;550048 |
193 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 139 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 113664.00 | 3.10 | 1107.52 | 2375.18 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 118848;112736;108160;112352;115904 |
193 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 139 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 107018.67 | 44.50 | 0.00 | 0.00 | true | 0.446990;0.443125;0.447226;0.440264;0.444045 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 102304;108064;112800;108512;104480 |
194 | InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 176.333 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 78.00 | 251769216 | 0.00 | 398762.67 | 4.80 | 631.38 | 3227.81 | false | 0.047907;0.047712;0.047541;0.047427;0.047454 | 251769216;251769216;251769216;251769216;251769216 | 0;0;0;0;0 | 400768;402464;397120;392864;398400 |
194 | InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 176.333 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572864.00 | 1170709.33 | 44.40 | 0.00 | 0.00 | true | 0.448125;0.439601;0.442577;0.442114;0.449910 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 1169344;1168576;1171968;1176416;1170816 |
195 | InceptionV4/InceptionV4/Mixed_6d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.00 | 83923072 | 783317.33 | 282101.33 | 3.10 | 78.77 | 1252.58 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 780672;782464;783488;785920;784000 | 282720;282816;275648;282688;280896 |
195 | InceptionV4/InceptionV4/Mixed_6d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.67 | 0 | 524288.00 | 187712.00 | 42.80 | 0.00 | 0.00 | true | 0.430779;0.427530;0.426704;0.423418;0.430136 | 0;0;0;0;0 | 524288;524288;524288;524288;524288 | 188096;184832;194112;186144;188896 |
196 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 55488 | 222208.00 | 11221.33 | 45.70 | 0.24 | 12.81 | true | 0.457852;0.454748;0.457118;0.457440;0.458945 | 55488;55488;55488;55488;55488 | 223232;222208;222208;222208;222208 | 11520;12192;10496;10752;11392 |
197 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 23.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 55488 | 179456.00 | 41344.00 | 44.80 | 0.25 | 11.89 | true | 0.447629;0.448717;0.447146;0.447922;0.447128 | 55488;55488;55488;55488;55488 | 41344;40320;41600;41088;41600 | 179584;179584;179328;179456;179328 |
198 | InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 22 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 356277.33 | 51.90 | 0.31 | 27.74 | true | 0.516279;0.520689;0.518844;0.517737;0.520993 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1536 | 357952;354336;360512;353952;356544 |
199 | InceptionV4/InceptionV4/Mixed_6d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 22.333 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 14720.00 | 43.40 | 2.43 | 9.25 | true | 0.433758;0.432617;0.435063;0.435615;0.434250 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 21120;12672;15872;9472;15616 |
200 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 597.33 | 41898.67 | 44.10 | 0.00 | 0.00 | true | 0.440334;0.440595;0.441044;0.440777;0.441028 | 0;0;0;0;0 | 256;2048;256;1280;256 | 42240;41600;41856;42240;41344 |
201 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 2005.33 | 43.60 | 0.00 | 0.00 | true | 0.435467;0.435780;0.435331;0.436073;0.435681 | 0;0;0;0;0 | 0;0;0;0;0 | 2176;1664;2176;1280;3072 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.00 | 244930560 | 3317.33 | 1091210.67 | 8.70 | 223.78 | 8164.35 | false | 0.086731;0.086864;0.086667;0.086669;0.086686 | 244930560;244930560;244930560;244930560;244930560 | 2720;3872;4768;3232;2848 | 1091392;1082560;1074720;1126144;1099680 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 13.33 | 13123584 | 2581.33 | 3802261.33 | 16.90 | 3.45 | 984.29 | true | 0.168986;0.169598;0.168476;0.168623;0.169573 | 13123584;13123584;13123584;13123584;13123584 | 2624;2368;2624;3136;2496 | 3824320;3790624;3818912;3779936;3797248 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222880.00 | 815754.67 | 2.00 | 1.12 | 105.63 | true | 0.020000;0.019980;0.019989;0.019986;0.019979 | 1161984;1161984;1161984;1161984;1161984 | 223008;222880;222240;222880;222880 | 810816;818144;811232;818528;817888 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032277.33 | 361973.33 | 44.00 | 0.00 | 0.00 | true | 0.439740;0.439049;0.441768;0.438297;0.440538 | 0;0;0;0;0 | 1032192;1032192;1032192;1032448;1033984 | 361792;361696;362432;343744;362464 |
202 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 187.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 1090176 | 1386.67 | 268309.33 | 2.00 | 4.04 | 233.59 | true | 0.019935;0.019939;0.019933;0.019928;0.019940 | 1090176;1090176;1090176;1090176;1090176 | 1472;1600;1344;1344;1344 | 265376;273312;272224;266528;266176 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 2151840.00 | 1802048.00 | 10.00 | 72.27 | 9217.82 | false | 0.099835;0.099897;0.099899;0.099944;0.099762 | 285752320;285752320;285752320;285752320;285752320 | 2138080;2206112;2178592;2113248;2138848 | 1802688;1812544;1799200;1804256;1795328 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.67 | 15310848 | 4874.67 | 4838272.00 | 18.50 | 3.16 | 977.27 | true | 0.184058;0.185725;0.185875;0.184867;0.183059 | 15310848;15310848;15310848;15310848;15310848 | 4865024;4825056;4827552;4840224;4847040 | 3936;4832;5344;5984;4448 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.33 | 1161984 | 228768.00 | 608768.00 | 2.00 | 1.39 | 81.07 | true | 0.019898;0.019897;0.019897;0.019898;0.019898 | 1161984;1161984;1161984;1161984;1161984 | 228768;228768;228768;228768;228768 | 604000;611968;623328;598528;610336 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204672.00 | 1067893.33 | 44.40 | 0.00 | 0.00 | true | 0.447468;0.444582;0.440239;0.441989;0.443936 | 0;0;0;0;0 | 1204672;1204672;1204672;1204672;1204672 | 1051008;1085856;1093824;1058848;1058976 |
203 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 194.333 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.33 | 1271872 | 7093.33 | 109557.33 | 2.30 | 10.90 | 200.83 | true | 0.023094;0.023088;0.023090;0.023090;0.023092 | 1271872;1271872;1271872;1271872;1271872 | 111008;100352;111872;105792;112032 | 6944;12448;7136;7200;6816 |
204 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 55488 | 223296.00 | 44629.33 | 45.90 | 0.21 | 11.10 | true | 0.458342;0.458554;0.459986;0.459837;0.458493 | 55488;55488;55488;55488;55488 | 223552;223296;223296;223296;223296 | 45440;42240;44928;45184;43776 |
205 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 22 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 1322.67 | 45.30 | 25.50 | 16.18 | false | 0.452989;0.453684;0.452982;0.452201;0.454209 | 64736;64736;64736;64736;64736 | 1408;1024;1664;1024;1536 | 1216;1216;1216;1216;1216 |
206 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 22.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 34016.00 | 43.90 | 0.00 | 0.00 | true | 0.439779;0.439473;0.440215;0.439117;0.439032 | 0;0;0;0;0 | 256;256;256;256;256 | 35712;34592;32128;35200;32256 |
207 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 19 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 512.00 | 43.80 | 0.00 | 0.00 | true | 0.437432;0.437948;0.438139;0.437444;0.437694 | 0;0;0;0;0 | 0;0;0;0;0 | 512;512;512;384;640 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 1766922.67 | 1916266.67 | 10.00 | 77.58 | 9217.82 | false | 0.100034;0.100013;0.099779;0.099886;0.099889 | 285752320;285752320;285752320;285752320;285752320 | 1760736;1761504;1762848;1829344;1776416 | 1924928;1920256;1875648;1916320;1912224 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.67 | 15310848 | 7381.33 | 4950890.67 | 18.20 | 3.09 | 977.27 | true | 0.181490;0.184417;0.181864;0.182279;0.181534 | 15310848;15310848;15310848;15310848;15310848 | 8320;7104;7616;6528;7424 | 4940608;4940480;4954432;4957632;4974976 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 222368.00 | 610720.00 | 2.00 | 1.39 | 77.47 | true | 0.019963;0.019960;0.019963;0.019968;0.019959 | 1161984;1161984;1161984;1161984;1161984 | 607712;617888;620960;606560;601056 | 222368;222368;222240;222368;222368 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204480.00 | 833504.00 | 43.90 | 0.00 | 0.00 | true | 0.443700;0.439181;0.437665;0.440139;0.437622 | 0;0;0;0;0 | 1204480;1205504;1204480;1204480;1204480 | 833312;833184;842688;834016;830912 |
208 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 190.667 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 2378.67 | 98464.00 | 2.30 | 12.61 | 254.37 | true | 0.023008;0.022999;0.022996;0.022996;0.022996 | 1271872;1271872;1271872;1271872;1271872 | 2368;1600;3744;2144;2624 | 104160;96416;103392;95584;95424 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 36.00 | 380608512 | 7046400.00 | 2695658.67 | 11.10 | 39.07 | 10572.46 | false | 0.110578;0.110532;0.110574;0.110383;0.110413 | 380608512;380608512;380608512;380608512;380608512 | 7052416;7019520;7063808;7090560;7022976 | 2688480;2709920;2697152;2701344;2687808 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 25.67 | 20414464 | 101578.67 | 7117056.00 | 20.10 | 2.83 | 795.36 | true | 0.200330;0.200886;0.202581;0.199851;0.200602 | 20414464;20414464;20414464;20414464;20414464 | 107424;103072;98784;102112;99552 | 7120000;7120704;7111776;7114528;7116640 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1606080.00 | 1026485.33 | 43.90 | 0.00 | 0.00 | true | 0.438852;0.441329;0.440361;0.439200;0.438687 | 0;0;0;0;0 | 1606080;1606080;1606336;1606080;1606080 | 1018176;1027744;1029632;1025184;1026528 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.00 | 1355648 | 259872.00 | 653120.00 | 2.30 | 1.48 | 135.56 | true | 0.023174;0.023174;0.023187;0.023173;0.023173 | 1355648;1355648;1355648;1355648;1355648 | 259872;259872;259872;259872;259872 | 641024;648608;665984;660256;650496 |
209 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1453568 | 5280.00 | 46496.00 | 2.60 | 28.07 | 242.26 | false | 0.026240;0.026237;0.026259;0.026235;0.026245 | 1453568;1453568;1453568;1453568;1453568 | 5536;5280;5280;5280;5280 | 44064;47520;47136;47264;45088 |
210 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 29 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 64736 | 260160.00 | 18688.00 | 46.00 | 0.23 | 13.87 | true | 0.459904;0.458416;0.458370;0.460624;0.460847 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;268864 | 20352;16000;17280;18944;19840 |
211 | InceptionV4/InceptionV4/Mixed_6d/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 22 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1344.00 | 2944.00 | 44.90 | 17.25 | 18.50 | true | 0.449615;0.449593;0.447529;0.448512;0.455341 | 73984;73984;73984;73984;73984 | 1344;1344;1344;1344;1344 | 1536;1280;1024;6016;9984 |
212 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 21 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 256.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.439642;0.439441;0.439275;0.439590;0.439320 | 0;0;0;0;0 | 256;256;256;256;256 | 0;0;0;0;0 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 35.67 | 333032448 | 5090762.67 | 2381557.33 | 9.90 | 44.57 | 9337.27 | false | 0.099409;0.099041;0.099199;0.099305;0.099276 | 333032448;333032448;333032448;333032448;333032448 | 2377280;2380640;2401312;2386752;2336960 | 5040928;5087648;5094816;5089824;5149344 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.00 | 17862656 | 23829.33 | 5796832.00 | 20.20 | 3.07 | 1050.74 | true | 0.202104;0.202326;0.200897;0.202163;0.202359 | 17862656;17862656;17862656;17862656;17862656 | 23360;22080;23232;24896;26688 | 5795232;5796288;5783520;5808384;5798976 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1405184.00 | 483445.33 | 43.70 | 0.00 | 0.00 | true | 0.434864;0.437973;0.435648;0.438160;0.439403 | 0;0;0;0;0 | 1405184;1405184;1405184;1405184;1405184 | 499008;477760;493120;473536;479456 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.67 | 1355648 | 259360.00 | 773098.67 | 2.30 | 1.31 | 127.09 | true | 0.023167;0.023173;0.023167;0.023182;0.023176 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 769728;768832;774592;775200;774976 |
213 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 842.67 | 16128.00 | 2.30 | 74.95 | 254.37 | false | 0.022971;0.022972;0.022971;0.022974;0.022972 | 1271872;1271872;1271872;1271872;1271872 | 800;800;800;928;928 | 16768;15840;18336;15520;15776 |
214 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 28.333 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 64736 | 1216.00 | 128.00 | 45.70 | 48.17 | 14.94 | false | 0.457254;0.455957;0.459047;0.456201;0.456520 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 0;256;256;128;0 |
215 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 21.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 96.00 | 170.67 | 44.90 | 0.00 | 0.00 | true | 0.449543;0.448395;0.448604;0.448839;0.448914 | 0;0;0;0;0 | 0;256;256;0;256 | 96;96;96;96;96 |
216 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 171.333 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 79.67 | 256975104 | 5738.67 | 152362.67 | 3.10 | 1625.38 | 3225.62 | false | 0.031249;0.031249;0.031249;0.031248;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 5568;11456;6080;5568;5568 | 156448;149888;154752;150656;151680 |
216 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 171.333 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1605888.00 | 856693.33 | 44.00 | 0.00 | 0.00 | true | 0.435913;0.444013;0.439649;0.441943;0.438811 | 0;0;0;0;0 | 1605888;1605888;1605888;1605888;1605888 | 854432;860192;855456;851872;893088 |
217 | InceptionV4/InceptionV4/Mixed_6d/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 28 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1024.00 | 10677.33 | 44.50 | 6.32 | 18.50 | true | 0.444033;0.443790;0.444161;0.445827;0.445660 | 73984;73984;73984;73984;73984 | 1024;1024;1024;1024;1024 | 10496;10656;11648;10880;9120 |
219 | InceptionV4/InceptionV4/Mixed_6d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 24.667 | 1627648 | 0 | 172234240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 42677.33 | 61.40 | 0.00 | 0.00 | true | 0.615680;0.613159;0.615781;0.613410;0.613844 | 0;0;0;0;0 | 0;0;0;0;5120 | 44032;42624;40576;41888;43520 |
220 | InceptionV4/InceptionV4/Mixed_6e/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 49 | 1479936 | 1479936 | 173714176 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 13.67 | 6232897 | 4117.33 | 677589.33 | 52.80 | 9.14 | 456.05 | true | 0.527521;0.528677;0.527159;0.526934;0.530051 | 6232897;6232897;6232897;6232897;6232897 | 4096;4096;4096;4160;4160 | 677632;677856;675456;677280;679136 |
221 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141.333 | 221952 | 1008384 | 173936128 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 170.67 | 424789.33 | 3.10 | 296.23 | 2375.18 | false | 0.031247;0.031247;0.031247;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;512;0;3328 | 428160;424960;427136;421120;422272 |
221 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141.333 | 221952 | 1008384 | 173936128 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 584640.00 | 43.70 | 0.00 | 0.00 | true | 0.439727;0.437892;0.440918;0.434219;0.431668 | 0;0;0;0;0 | 786432;786432;786560;786432;786432 | 581632;584864;580128;588320;587424 |
222 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140.333 | 221952 | 1008384 | 174158080 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 89792.00 | 3.10 | 1401.96 | 2375.18 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 88512;91072;86464;89792;94912 |
222 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 140.333 | 221952 | 1008384 | 174158080 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 132224.00 | 43.80 | 0.00 | 0.00 | true | 0.438692;0.436443;0.434067;0.439270;0.438163 | 0;0;0;0;0 | 133408;130976;135456;132288;127008 | 786432;786432;786432;786432;786432 |
223 | InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 168.667 | 443904 | 2016768 | 174601984 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 78.00 | 251769216 | 15061.33 | 502848.00 | 4.70 | 486.13 | 3227.81 | false | 0.047825;0.047248;0.047508;0.047508;0.047473 | 251769216;251769216;251769216;251769216;251769216 | 14464;16256;15744;14592;14848 | 505152;503360;502464;498848;502720 |
223 | InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 168.667 | 443904 | 2016768 | 174601984 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1572928.00 | 1190901.33 | 44.10 | 0.00 | 0.00 | true | 0.438286;0.444163;0.441370;0.440596;0.441703 | 0;0;0;0;0 | 1572928;1572928;1572928;1572928;1572928 | 1190880;1191648;1190624;1191200;1188512 |
224 | InceptionV4/InceptionV4/Mixed_6e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152 | 147968 | 672256 | 173122304 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.33 | 83923072 | 779178.67 | 195381.33 | 3.10 | 86.11 | 1246.39 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 779008;779264;778496;779264;781568 | 196128;195264;196160;194752;189760 |
224 | InceptionV4/InceptionV4/Mixed_6e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 152 | 147968 | 672256 | 173122304 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524288.00 | 212810.67 | 42.90 | 0.00 | 0.00 | true | 0.430737;0.429054;0.429028;0.428987;0.422560 | 0;0;0;0;0 | 524288;524288;529664;524288;524288 | 212224;209824;213440;212768;217824 |
225 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 222464.00 | 554.67 | 45.70 | 0.25 | 13.87 | true | 0.456491;0.455502;0.457922;0.456799;0.458896 | 55488;55488;55488;55488;55488 | 222592;222080;222336;222720;222464 | 640;384;640;384;640 |
226 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 22 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 55488 | 222720.00 | 95786.67 | 44.50 | 0.17 | 12.81 | true | 0.445591;0.445200;0.443220;0.445187;0.442727 | 55488;55488;55488;55488;55488 | 96000;95232;96000;95360;96000 | 222720;222720;222720;222720;222720 |
227 | InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 23.333 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 255306.67 | 51.90 | 0.43 | 27.74 | true | 0.520168;0.517154;0.517790;0.518796;0.519131 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1536 | 254880;247072;256032;255008;267040 |
228 | InceptionV4/InceptionV4/Mixed_6e/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 21 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 10752.00 | 43.50 | 3.28 | 9.25 | true | 0.434430;0.434040;0.435846;0.436119;0.433582 | 36992;36992;36992;36992;36992 | 512;512;512;7168;512 | 7936;3584;12032;19072;12288 |
229 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 938.67 | 88704.00 | 43.90 | 0.00 | 0.00 | true | 0.439526;0.440121;0.439176;0.438888;0.438791 | 0;0;0;0;0 | 256;2304;256;5888;256 | 93696;86784;94592;85632;67840 |
230 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 256.00 | 43.60 | 0.00 | 0.00 | true | 0.435724;0.435566;0.436338;0.435190;0.435432 | 0;0;0;0;0 | 0;0;0;0;0 | 256;256;256;256;256 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.00 | 244930560 | 4362.67 | 1042666.67 | 8.70 | 233.93 | 8164.35 | false | 0.086712;0.086747;0.086581;0.086615;0.086560 | 244930560;244930560;244930560;244930560;244930560 | 1035872;1014080;1047712;1059040;1044416 | 5856;4576;3424;3168;5088 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 13.67 | 13123584 | 2346.67 | 3846858.67 | 16.90 | 3.41 | 960.24 | true | 0.169168;0.167654;0.169016;0.169064;0.169596 | 13123584;13123584;13123584;13123584;13123584 | 3865408;3885568;3832480;3841440;3833728 | 2688;2368;2304;2304;2368 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222794.67 | 757856.00 | 2.00 | 1.18 | 105.63 | true | 0.019986;0.019984;0.019962;0.019971;0.020001 | 1161984;1161984;1161984;1161984;1161984 | 223136;222880;222624;222752;222752 | 750176;750016;768256;756480;766912 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 345930.67 | 43.50 | 0.00 | 0.00 | true | 0.434428;0.435502;0.435474;0.436349;0.433699 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 349568;345632;342592;340352;353184 |
231 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 185.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 1090176 | 8864.00 | 293280.00 | 2.00 | 3.61 | 233.59 | true | 0.019942;0.019949;0.019956;0.019948;0.019953 | 1090176;1090176;1090176;1090176;1090176 | 281920;294752;292992;293696;293152 | 9760;8512;8320;8064;9792 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 1920330.67 | 1833568.00 | 10.00 | 76.12 | 9217.82 | false | 0.099564;0.099831;0.100143;0.099773;0.099631 | 285752320;285752320;285752320;285752320;285752320 | 1831744;1835424;1819904;1833536;1842304 | 1934336;1933856;1865632;1910816;1916320 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.67 | 15310848 | 6154.67 | 4856181.33 | 18.60 | 3.15 | 977.27 | true | 0.185520;0.186530;0.185975;0.184329;0.185359 | 15310848;15310848;15310848;15310848;15310848 | 5728;6240;5216;6880;6496 | 4866944;4838432;4863488;4854016;4851040 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 228768.00 | 662261.33 | 2.00 | 1.30 | 77.47 | true | 0.019898;0.019898;0.019898;0.019898;0.019898 | 1161984;1161984;1161984;1161984;1161984 | 662240;661472;654720;669184;663072 | 228768;228768;228768;228768;228768 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204672.00 | 1083722.67 | 44.40 | 0.00 | 0.00 | true | 0.442621;0.445956;0.442508;0.444699;0.445960 | 0;0;0;0;0 | 1204672;1204672;1204672;1204672;1204672 | 1081632;1088768;1085248;1073088;1084288 |
232 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.667 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1271872 | 13717.33 | 70464.00 | 2.30 | 15.11 | 211.98 | true | 0.023087;0.023087;0.023089;0.023087;0.023087 | 1271872;1271872;1271872;1271872;1271872 | 16032;18560;11680;12288;12832 | 70208;67616;75648;70656;70528 |
233 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 55488 | 223296.00 | 19338.67 | 45.80 | 0.23 | 11.10 | true | 0.458463;0.458434;0.458264;0.458263;0.459864 | 55488;55488;55488;55488;55488 | 223296;223296;223296;223296;223296 | 19456;18048;19296;19488;19264 |
234 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 23.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 23392.00 | 45.40 | 2.63 | 16.18 | true | 0.454239;0.454413;0.453392;0.453157;0.451968 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 23776;22752;22880;24192;23520 |
235 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 21.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 256.00 | 2656.00 | 43.90 | 0.00 | 0.00 | true | 0.439424;0.439764;0.439158;0.438997;0.439840 | 0;0;0;0;0 | 256;256;256;256;256 | 2560;2560;2432;4480;2848 |
236 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 21.333 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 341.33 | 43.70 | 0.00 | 0.00 | true | 0.437277;0.437658;0.437123;0.437718;0.437250 | 0;0;0;0;0 | 256;512;0;256;512 | 0;0;0;0;0 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 188 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 1486144.00 | 1870101.33 | 10.00 | 85.14 | 9217.82 | false | 0.099848;0.099872;0.100377;0.100335;0.099899 | 285752320;285752320;285752320;285752320;285752320 | 1527552;1498528;1471136;1467392;1488768 | 1882624;1864576;1863104;1849152;1894752 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 188 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 5141.33 | 4914101.33 | 18.10 | 3.11 | 956.93 | true | 0.180096;0.181299;0.179061;0.183190;0.180262 | 15310848;15310848;15310848;15310848;15310848 | 4866656;4898048;4931296;4923520;4920736 | 5440;4416;4544;5696;5440 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 188 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 222368.00 | 698656.00 | 2.00 | 1.26 | 77.47 | true | 0.019972;0.019971;0.019968;0.019969;0.019966 | 1161984;1161984;1161984;1161984;1161984 | 720768;700768;696032;697568;697632 | 222368;222368;222368;222368;222368 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 188 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.67 | 0 | 1204480.00 | 834997.33 | 44.00 | 0.00 | 0.00 | true | 0.438833;0.440804;0.440718;0.439368;0.439003 | 0;0;0;0;0 | 1204480;1204736;1204480;1204480;1204480 | 839168;843520;827456;831808;834016 |
237 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 188 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 20032.00 | 81728.00 | 2.30 | 12.50 | 254.37 | true | 0.023005;0.023002;0.023011;0.023013;0.023009 | 1271872;1271872;1271872;1271872;1271872 | 20992;23872;17664;19072;20032 | 75104;82688;81504;83808;80992 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 36.67 | 380608512 | 6943306.67 | 2716117.33 | 11.10 | 39.40 | 10380.14 | false | 0.110542;0.110526;0.110493;0.110610;0.110442 | 380608512;380608512;380608512;380608512;380608512 | 6931104;6969376;6897184;7027872;6929440 | 2717120;2709216;2720480;2710752;2748384 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 24.67 | 20414464 | 66400.00 | 6921376.00 | 20.20 | 2.92 | 827.60 | true | 0.201579;0.200145;0.204203;0.202165;0.201709 | 20414464;20414464;20414464;20414464;20414464 | 67424;70880;64480;67296;64480 | 6924192;6921600;6922144;6920384;6896768 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.67 | 0 | 1606080.00 | 925098.67 | 44.00 | 0.00 | 0.00 | true | 0.439058;0.439030;0.443016;0.434686;0.440646 | 0;0;0;0;0 | 1606080;1606080;1606080;1606080;1606080 | 932288;929696;925504;920096;906304 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259872.00 | 748490.67 | 2.30 | 1.34 | 123.24 | true | 0.023176;0.023188;0.023166;0.023191;0.023162 | 1355648;1355648;1355648;1355648;1355648 | 259872;259872;259872;259872;259872 | 749216;739104;749376;746880;759616 |
238 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 205 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1453568 | 5280.00 | 60416.00 | 2.60 | 22.13 | 242.26 | false | 0.026284;0.026261;0.026260;0.026261;0.026266 | 1453568;1453568;1453568;1453568;1453568 | 5536;5280;5280;5280;5280 | 63744;59136;61824;58112;60288 |
239 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 29.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 64736 | 260160.00 | 31616.00 | 46.00 | 0.22 | 14.94 | true | 0.461428;0.459024;0.462421;0.459637;0.459366 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;265536 | 31104;29440;32256;32384;31488 |
240 | InceptionV4/InceptionV4/Mixed_6e/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 21.667 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1685.33 | 1536.00 | 44.90 | 22.97 | 18.50 | false | 0.448732;0.449202;0.448352;0.448035;0.450762 | 73984;73984;73984;73984;73984 | 1344;1344;1344;2368;6464 | 1024;1152;2304;2816;1152 |
241 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 21.333 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 256.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.439131;0.439084;0.440333;0.439366;0.439277 | 0;0;0;0;0 | 256;256;256;256;256 | 0;0;0;0;256 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 36.00 | 333032448 | 5185280.00 | 2311936.00 | 9.90 | 44.42 | 9250.90 | false | 0.099331;0.099295;0.099360;0.099072;0.099511 | 333032448;333032448;333032448;333032448;333032448 | 5162240;5264128;5131392;4970112;5262208 | 2312736;2307872;2321824;2315200;2303008 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.67 | 17862656 | 26986.67 | 5835146.67 | 20.20 | 3.05 | 1071.74 | true | 0.201705;0.202470;0.202987;0.201538;0.201285 | 17862656;17862656;17862656;17862656;17862656 | 26176;28992;27072;24640;27712 | 5815424;5852576;5833248;5840896;5831296 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.67 | 0 | 1405184.00 | 389578.67 | 43.50 | 0.00 | 0.00 | true | 0.433413;0.435921;0.434123;0.433841;0.437087 | 0;0;0;0;0 | 1405184;1405184;1405184;1405184;1405184 | 389632;395680;384064;395040;375200 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.67 | 1355648 | 259360.00 | 824437.33 | 2.30 | 1.25 | 127.09 | true | 0.023166;0.023153;0.023176;0.023173;0.023177 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 837600;815936;815584;819776;844512 |
242 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 672.00 | 29301.33 | 2.30 | 42.43 | 254.37 | false | 0.022975;0.022975;0.022975;0.022977;0.022974 | 1271872;1271872;1271872;1271872;1271872 | 28576;25728;31008;37632;28320 | 672;672;672;672;672 |
243 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 29.333 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 85.33 | 45.70 | 49.75 | 16.18 | false | 0.455858;0.459641;0.457510;0.456312;0.457931 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 384;256;0;0;0 |
244 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 21.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 96.00 | 256.00 | 45.00 | 0.00 | 0.00 | true | 0.451534;0.454685;0.449145;0.448596;0.449742 | 0;0;0;0;0 | 96;96;96;96;96 | 0;256;1792;256;256 |
245 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 174.667 | 295936 | 1901568 | 172271360 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 79.33 | 256975104 | 5824.00 | 237696.00 | 3.10 | 1055.25 | 3239.20 | false | 0.031248;0.031248;0.031249;0.031249;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 5568;6336;5568;12480;5568 | 239136;234592;237120;240096;236832 |
245 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 174.667 | 295936 | 1901568 | 172271360 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1605888.00 | 596544.00 | 43.80 | 0.00 | 0.00 | true | 0.437319;0.438295;0.439732;0.433240;0.438855 | 0;0;0;0;0 | 1605888;1605888;1605888;1605888;1605888 | 594144;604736;595808;599680;592608 |
246 | InceptionV4/InceptionV4/Mixed_6e/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 29.333 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1024.00 | 30496.00 | 44.40 | 2.35 | 18.50 | true | 0.444666;0.444306;0.443843;0.444183;0.446306 | 73984;73984;73984;73984;73984 | 32992;28320;33728;26752;30176 | 1024;1024;1024;1024;1024 |
248 | InceptionV4/InceptionV4/Mixed_6e/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 25 | 1479936 | 0 | 172086528 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 131797.33 | 61.50 | 0.00 | 0.00 | true | 0.614623;0.615378;0.616009;0.614241;0.616070 | 0;0;0;0;0 | 0;0;0;0;0 | 131712;131584;131968;131712;131968 |
249 | InceptionV4/InceptionV4/Mixed_6f/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 47.667 | 1183744 | 1183744 | 173270272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 13.67 | 5969482 | 4096.00 | 1048640.00 | 52.60 | 5.67 | 436.78 | true | 0.526063;0.527689;0.525312;0.525850;0.525384 | 5969482;5969482;5969482;5969482;5969482 | 4096;4096;4096;8448;4096 | 1045632;1048800;1048320;1048800;1049088 |
250 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 142.333 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 256.00 | 354410.67 | 3.10 | 354.94 | 2375.18 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;2560;256;0;512 | 361632;358912;350560;353632;350688 |
250 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 142.333 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 7.33 | 0 | 786496.00 | 545216.00 | 43.50 | 0.00 | 0.00 | true | 0.436054;0.435272;0.430738;0.438672;0.432923 | 0;0;0;0;0 | 543008;545952;546336;551072;543360 | 786496;786496;786496;786496;786496 |
251 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 139.667 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 122912.00 | 3.10 | 1024.18 | 2375.18 | false | 0.031246;0.031247;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 126304;123392;122656;122464;122688 | 0;0;0;0;0 |
251 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 139.667 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 101077.33 | 43.60 | 0.00 | 0.00 | true | 0.436540;0.432049;0.437705;0.434663;0.437765 | 0;0;0;0;0 | 786432;786432;786432;786688;786432 | 97376;100864;101280;101088;101600 |
252 | InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 168.667 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 78.00 | 251769216 | 0.00 | 358400.00 | 4.80 | 702.48 | 3227.81 | false | 0.047561;0.047668;0.047203;0.047997;0.047590 | 251769216;251769216;251769216;251769216;251769216 | 0;0;4096;0;0 | 364416;351040;361632;355200;358368 |
252 | InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 168.667 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1572864.00 | 1231744.00 | 44.10 | 0.00 | 0.00 | true | 0.451555;0.438155;0.434358;0.440466;0.445166 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 1225728;1241280;1230880;1233888;1230464 |
253 | InceptionV4/InceptionV4/Mixed_6f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 154.667 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.33 | 83923072 | 764160.00 | 275605.33 | 3.10 | 80.71 | 1246.39 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 764032;761088;764544;763904;765568 | 280256;277952;270784;276544;272320 |
253 | InceptionV4/InceptionV4/Mixed_6f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 154.667 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.33 | 0 | 524288.00 | 195232.00 | 42.60 | 0.00 | 0.00 | true | 0.425764;0.427914;0.425928;0.423811;0.425329 | 0;0;0;0;0 | 524288;524288;530688;524288;524288 | 186048;193152;198976;196032;196512 |
254 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223232.00 | 15061.33 | 45.70 | 0.23 | 13.87 | true | 0.456748;0.457104;0.460030;0.457801;0.457063 | 55488;55488;55488;55488;55488 | 223232;223232;223232;223232;223232 | 15104;15104;14976;14976;15104 |
255 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 22.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 55488 | 207872.00 | 34741.33 | 44.80 | 0.23 | 12.81 | true | 0.446752;0.447816;0.448612;0.446445;0.448136 | 55488;55488;55488;55488;55488 | 207872;207872;207872;207872;207872 | 34720;34816;33792;34688;34944 |
256 | InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 21.667 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 349760.00 | 51.80 | 0.32 | 27.74 | true | 0.518690;0.518783;0.517445;0.517888;0.518078 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1536 | 356288;341312;353344;354624;341312 |
257 | InceptionV4/InceptionV4/Mixed_6f/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 20.333 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 17536.00 | 43.80 | 2.05 | 9.25 | true | 0.434685;0.435317;0.439193;0.440111;0.438883 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 17280;11904;16896;18432;21760 |
258 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 21.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 43306.67 | 44.00 | 0.00 | 0.00 | true | 0.439557;0.439534;0.440124;0.440381;0.439586 | 0;0;0;0;0 | 256;256;256;256;256 | 43392;43264;43520;43264;43136 |
259 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 19 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 2048.00 | 43.60 | 0.00 | 0.00 | true | 0.435657;0.436302;0.435885;0.435976;0.435362 | 0;0;0;0;0 | 1280;896;2560;2304;2560 | 0;0;0;0;0 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.00 | 244930560 | 4085.33 | 1107754.67 | 8.70 | 220.29 | 8164.35 | false | 0.086681;0.086440;0.086810;0.086718;0.086793 | 244930560;244930560;244930560;244930560;244930560 | 3488;2464;4768;5024;4000 | 1111520;1109248;1102496;1114208;1074816 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 13.33 | 13123584 | 2581.33 | 3841013.33 | 16.80 | 3.41 | 984.29 | true | 0.169334;0.167946;0.167529;0.168983;0.168492 | 13123584;13123584;13123584;13123584;13123584 | 2496;2496;2752;2368;2880 | 3848640;3840320;3834080;3812256;3857600 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222752.00 | 808736.00 | 2.00 | 1.13 | 105.63 | true | 0.019990;0.019987;0.019982;0.019999;0.019968 | 1161984;1161984;1161984;1161984;1161984 | 801376;806112;814432;821088;805664 | 223008;222880;222496;222752;222624 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 262858.67 | 43.50 | 0.00 | 0.00 | true | 0.434917;0.435206;0.435992;0.432631;0.435093 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 263616;254912;261888;275392;263072 |
260 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1090176 | 1386.67 | 264608.00 | 2.00 | 4.10 | 218.04 | true | 0.019942;0.019933;0.019935;0.019928;0.019916 | 1090176;1090176;1090176;1090176;1090176 | 6592;1344;1344;1472;1344 | 265632;264960;268192;263232;261344 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 2115274.67 | 1804586.67 | 10.00 | 72.90 | 9217.82 | false | 0.100077;0.099930;0.099910;0.099760;0.100043 | 285752320;285752320;285752320;285752320;285752320 | 2087776;2192480;2067488;2190560;2012704 | 1823648;1802784;1804096;1806880;1772800 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.67 | 15310848 | 5642.67 | 4866538.67 | 18.50 | 3.14 | 977.27 | true | 0.183887;0.187601;0.185892;0.184467;0.185155 | 15310848;15310848;15310848;15310848;15310848 | 4854080;4851392;4867776;4887328;4877760 | 5216;6240;5472;6496;4832 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 1161984 | 228768.00 | 613045.33 | 2.00 | 1.38 | 83.00 | true | 0.019898;0.019899;0.019898;0.019897;0.019898 | 1161984;1161984;1161984;1161984;1161984 | 233632;228768;228768;228768;228768 | 614976;614240;611168;606048;613728 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204672.00 | 1058122.67 | 44.20 | 0.00 | 0.00 | true | 0.439993;0.449698;0.443156;0.440334;0.441945 | 0;0;0;0;0 | 1204672;1204672;1204672;1204672;1204672 | 1055520;1063616;1068928;1053568;1055232 |
261 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.33 | 1271872 | 7765.33 | 107648.00 | 2.30 | 11.02 | 200.83 | true | 0.023091;0.023087;0.023095;0.023089;0.023095 | 1271872;1271872;1271872;1271872;1271872 | 7968;7872;6944;12704;7456 | 107424;96256;111232;104288;117344 |
262 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 30.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 55488 | 223296.00 | 40533.33 | 45.80 | 0.21 | 12.81 | true | 0.458749;0.457986;0.458060;0.460772;0.457627 | 55488;55488;55488;55488;55488 | 223296;223296;223296;223296;223296 | 40192;38016;40704;40704;43584 |
263 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 22 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 1194.67 | 45.30 | 26.85 | 16.18 | false | 0.454883;0.454290;0.452795;0.452327;0.452945 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 896;1024;1792;1280;1280 |
264 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 21.333 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 31370.67 | 44.00 | 0.00 | 0.00 | true | 0.439713;0.439776;0.439818;0.439657;0.439357 | 0;0;0;0;0 | 31616;30848;31872;31616;30880 | 256;256;256;256;256 |
265 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 19.667 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 512.00 | 43.80 | 0.00 | 0.00 | true | 0.437972;0.437868;0.437546;0.437862;0.437999 | 0;0;0;0;0 | 0;0;2304;0;0 | 256;512;768;512;512 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 1769418.67 | 1912661.33 | 10.00 | 77.61 | 9217.82 | false | 0.099937;0.100024;0.099955;0.099954;0.099818 | 285752320;285752320;285752320;285752320;285752320 | 1912832;1936512;1905952;1912960;1912192 | 1764960;1734880;1764768;1778528;1804448 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.67 | 15310848 | 5568.00 | 4879466.67 | 18.30 | 3.13 | 977.27 | true | 0.182632;0.181439;0.184090;0.183433;0.180546 | 15310848;15310848;15310848;15310848;15310848 | 5504;5632;5568;5248;6592 | 4868032;4886688;4884320;4873472;4880608 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 222368.00 | 644330.67 | 2.00 | 1.34 | 77.47 | true | 0.019958;0.019963;0.019965;0.019952;0.019965 | 1161984;1161984;1161984;1161984;1161984 | 222368;222368;222368;222368;222368 | 648320;651360;650976;633696;630848 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204480.00 | 858474.67 | 43.90 | 0.00 | 0.00 | true | 0.442146;0.439424;0.438883;0.438036;0.438897 | 0;0;0;0;0 | 1204480;1204480;1204480;1204480;1204480 | 854848;851680;852288;868288;871584 |
266 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 2474.67 | 96853.33 | 2.30 | 12.80 | 254.37 | true | 0.022998;0.022997;0.022999;0.022998;0.022996 | 1271872;1271872;1271872;1271872;1271872 | 2464;2368;2592;1568;3872 | 96384;98848;102464;91904;95328 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 204.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 36.00 | 380608512 | 7055701.33 | 2697706.67 | 11.10 | 39.02 | 10572.46 | false | 0.110816;0.110449;0.110782;0.110547;0.110615 | 380608512;380608512;380608512;380608512;380608512 | 6997248;7042688;7065216;7059200;7097344 | 2668352;2693920;2706336;2698592;2700608 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 204.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 25.00 | 20414464 | 98613.33 | 7107445.33 | 20.10 | 2.83 | 816.58 | true | 0.200389;0.201236;0.198360;0.202309;0.200639 | 20414464;20414464;20414464;20414464;20414464 | 95264;97184;102112;98720;99936 | 7107008;7098848;7116480;7122976;7094592 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 204.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1606080.00 | 1028224.00 | 43.90 | 0.00 | 0.00 | true | 0.438914;0.441976;0.440880;0.438155;0.435624 | 0;0;0;0;0 | 1606080;1606080;1606080;1606080;1606080 | 1032576;1012320;1019904;1043776;1032192 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 204.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259872.00 | 656586.67 | 2.30 | 1.48 | 123.24 | true | 0.023178;0.023185;0.023185;0.023176;0.023171 | 1355648;1355648;1355648;1355648;1355648 | 259872;259872;259872;259872;259872 | 656192;660800;652768;647456;669280 |
267 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 204.667 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1453568 | 5280.00 | 42954.67 | 2.60 | 30.14 | 242.26 | false | 0.026236;0.026245;0.026238;0.026261;0.026234 | 1453568;1453568;1453568;1453568;1453568 | 5536;5280;5280;5280;5280 | 46496;43808;41248;43168;41888 |
268 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 29.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 260160.00 | 19328.00 | 46.00 | 0.23 | 16.18 | true | 0.460575;0.459631;0.460023;0.459296;0.462211 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;260160 | 19968;17408;18432;21120;19584 |
269 | InceptionV4/InceptionV4/Mixed_6f/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 23.667 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 73984 | 1344.00 | 2346.67 | 44.90 | 20.05 | 17.07 | false | 0.448977;0.449514;0.448611;0.448603;0.449724 | 73984;73984;73984;73984;73984 | 896;512;1024;5248;5120 | 1344;1344;1344;6976;1344 |
270 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 20.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.439802;0.439449;0.439124;0.439067;0.439967 | 0;0;0;0;0 | 256;256;256;256;2048 | 0;0;0;0;0 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 35.33 | 333032448 | 5179168.00 | 2351840.00 | 9.90 | 44.22 | 9425.54 | false | 0.099460;0.099209;0.099192;0.099436;0.099347 | 333032448;333032448;333032448;333032448;333032448 | 2381024;2334400;2380192;2337472;2337856 | 5147296;5225888;5124000;5217696;5172512 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.67 | 17862656 | 26090.67 | 5812554.67 | 20.10 | 3.06 | 1071.74 | true | 0.201902;0.202506;0.201073;0.201183;0.199800 | 17862656;17862656;17862656;17862656;17862656 | 26048;26304;27712;25280;25920 | 5796128;5825056;5809024;5815168;5813472 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1405184.00 | 502400.00 | 43.80 | 0.00 | 0.00 | true | 0.440875;0.438786;0.437322;0.436790;0.438311 | 0;0;0;0;0 | 513760;505184;490496;495488;506528 | 1405184;1405184;1405184;1405184;1405184 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.67 | 1355648 | 259360.00 | 765098.67 | 2.30 | 1.32 | 127.09 | true | 0.023165;0.023169;0.023167;0.023171;0.023192 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259872 | 763808;770752;751040;767424;764064 |
271 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 191.667 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 1271872 | 906.67 | 16672.00 | 2.30 | 72.35 | 272.52 | false | 0.022972;0.022973;0.022976;0.022972;0.022972 | 1271872;1271872;1271872;1271872;1271872 | 896;800;928;1056;896 | 17312;16576;16960;16480;15072 |
272 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 28.333 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 42.67 | 45.70 | 51.43 | 16.18 | false | 0.459122;0.456253;0.455760;0.458279;0.456664 | 64736;64736;64736;64736;64736 | 1216;1216;1216;1216;1216 | 256;128;0;0;0 |
273 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 20.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 96.00 | 42.67 | 44.90 | 0.00 | 0.00 | true | 0.448839;0.448555;0.448492;0.453044;0.448533 | 0;0;0;0;0 | 96;96;96;96;96 | 0;0;128;0;256 |
274 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 173.333 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 79.67 | 256975104 | 5568.00 | 163413.33 | 3.10 | 1520.73 | 3225.62 | false | 0.031248;0.031249;0.031248;0.031248;0.031249 | 256975104;256975104;256975104;256975104;256975104 | 7872;5312;5568;5568;5568 | 165344;163104;161792;161408;166720 |
274 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 173.333 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1605888.00 | 892789.33 | 43.90 | 0.00 | 0.00 | true | 0.438938;0.439091;0.441787;0.436136;0.438596 | 0;0;0;0;0 | 889760;894880;892576;903072;890912 | 1605888;1605888;1605888;1605888;1605888 |
275 | InceptionV4/InceptionV4/Mixed_6f/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 28.667 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1024.00 | 11146.67 | 44.40 | 6.08 | 18.50 | true | 0.444105;0.444122;0.444527;0.443842;0.445213 | 73984;73984;73984;73984;73984 | 1024;1024;1024;1024;6144 | 10880;10784;11776;11776;9216 |
277 | InceptionV4/InceptionV4/Mixed_6f/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 25.333 | 1627648 | 0 | 172234240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 34389.33 | 61.50 | 0.00 | 0.00 | true | 0.616245;0.614196;0.614443;0.615028;0.615141 | 0;0;0;0;0 | 0;0;0;0;0 | 31872;33920;34688;37888;34560 |
278 | InceptionV4/InceptionV4/Mixed_6g/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 48 | 1479936 | 1479936 | 173714176 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 14.00 | 5965762 | 4138.67 | 666357.33 | 52.80 | 8.90 | 426.13 | true | 0.528152;0.529212;0.528302;0.528472;0.527281 | 5965762;5965762;5965762;5965762;5965762 | 4224;4160;4096;4096;4160 | 667072;665376;665504;667104;666496 |
279 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 143.667 | 221952 | 1008384 | 173936128 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 423872.00 | 3.10 | 296.99 | 2375.18 | false | 0.031246;0.031247;0.031246;0.031246;0.031247 | 125884608;125884608;125884608;125884608;125884608 | 0;256;0;0;0 | 425792;423424;417792;422400;427328 |
279 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 143.667 | 221952 | 1008384 | 173936128 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 585738.67 | 43.60 | 0.00 | 0.00 | true | 0.435109;0.439332;0.431046;0.433756;0.438931 | 0;0;0;0;0 | 584000;585792;591648;587424;582144 | 786432;786432;786432;786432;789760 |
280 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 138 | 221952 | 1008384 | 174158080 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 106474.67 | 3.10 | 1182.30 | 2375.18 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 107520;105568;108160;103744;106336 |
280 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 138 | 221952 | 1008384 | 174158080 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 117322.67 | 44.00 | 0.00 | 0.00 | true | 0.440135;0.440694;0.440425;0.438190;0.437892 | 0;0;0;0;0 | 786432;786432;786432;788480;786432 | 117216;117760;115712;123360;116992 |
281 | InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 171.667 | 443904 | 2016768 | 174601984 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 78.00 | 251769216 | 16938.67 | 518912.00 | 4.80 | 469.85 | 3227.81 | false | 0.047354;0.047581;0.047893;0.047871;0.047758 | 251769216;251769216;251769216;251769216;251769216 | 17280;16768;17280;16640;16768 | 507968;522368;515968;518400;530144 |
281 | InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 171.667 | 443904 | 2016768 | 174601984 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.67 | 0 | 1572928.00 | 1172469.33 | 43.90 | 0.00 | 0.00 | true | 0.440047;0.436101;0.440642;0.441585;0.436336 | 0;0;0;0;0 | 1185312;1173120;1171680;1172608;1162944 | 1572928;1572928;1572928;1572928;1572864 |
282 | InceptionV4/InceptionV4/Mixed_6g/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 151 | 147968 | 672256 | 173122304 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 68.00 | 83923072 | 811136.00 | 219957.33 | 3.10 | 81.39 | 1234.16 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 810240;810496;811904;814848;811008 | 216160;221568;219424;219392;221056 |
282 | InceptionV4/InceptionV4/Mixed_6g/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 151 | 147968 | 672256 | 173122304 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.67 | 0 | 524373.33 | 187210.67 | 43.00 | 0.00 | 0.00 | true | 0.435489;0.425514;0.432181;0.428405;0.428192 | 0;0;0;0;0 | 531456;524288;524544;524288;524288 | 189952;185664;186016;189248;186368 |
283 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 55488 | 223232.00 | 512.00 | 45.80 | 0.25 | 11.10 | true | 0.456988;0.458144;0.456336;0.457516;0.458608 | 55488;55488;55488;55488;55488 | 223232;223232;223232;223232;223232 | 512;640;512;512;512 |
284 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 24 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 55488 | 194816.00 | 95392.00 | 44.60 | 0.19 | 11.89 | true | 0.445907;0.445619;0.445763;0.447622;0.447029 | 55488;55488;55488;55488;55488 | 194304;195840;194304;194304;200960 | 95776;92320;95648;94752;109600 |
285 | InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 22 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1536.00 | 254634.67 | 51.80 | 0.43 | 27.74 | true | 0.517350;0.517974;0.516573;0.518318;0.518186 | 110976;110976;110976;110976;110976 | 1536;1536;1536;1536;1536 | 254720;248960;258560;256640;252544 |
286 | InceptionV4/InceptionV4/Mixed_6g/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 22.333 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 11648.00 | 43.60 | 3.04 | 9.25 | true | 0.435395;0.434949;0.436206;0.436963;0.433254 | 36992;36992;36992;36992;36992 | 512;768;512;512;512 | 16640;11776;6528;18304;6400 |
287 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 21.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 256.00 | 87978.67 | 43.90 | 0.00 | 0.00 | true | 0.439570;0.439368;0.438988;0.438749;0.439535 | 0;0;0;0;0 | 93056;85504;93440;85376;77056 | 256;256;256;256;256 |
288 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 19.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 256.00 | 43.60 | 0.00 | 0.00 | true | 0.435536;0.436076;0.435731;0.435588;0.435336 | 0;0;0;0;0 | 0;0;0;0;0 | 256;256;256;256;256 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.00 | 244930560 | 4192.00 | 1042709.33 | 8.70 | 233.96 | 8164.35 | false | 0.086833;0.086801;0.086663;0.086540;0.086793 | 244930560;244930560;244930560;244930560;244930560 | 3808;4832;4576;4192;3424 | 1026560;1036992;1049984;1049920;1041216 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 13.67 | 13123584 | 2453.33 | 3877504.00 | 16.80 | 3.38 | 960.24 | true | 0.168112;0.168623;0.167086;0.166461;0.167962 | 13123584;13123584;13123584;13123584;13123584 | 2368;2560;2432;2560;2368 | 3879456;3913568;3870336;3870752;3882304 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.67 | 1161984 | 222965.33 | 745312.00 | 2.00 | 1.20 | 108.93 | true | 0.019989;0.019966;0.019991;0.019989;0.019973 | 1161984;1161984;1161984;1161984;1161984 | 223136;223008;222752;222880;223008 | 746048;739968;756672;749920;736224 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 250848.00 | 43.30 | 0.00 | 0.00 | true | 0.432607;0.431315;0.432075;0.435354;0.433522 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1032192 | 251744;244832;259136;254560;246240 |
289 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 186.333 | 221952 | 7381504 | 171864320 | GPU_0_bfc | 7159552 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.67 | 1090176 | 8021.33 | 287808.00 | 2.00 | 3.69 | 233.59 | true | 0.019933;0.019936;0.019949;0.019946;0.019936 | 1090176;1090176;1090176;1090176;1090176 | 301568;289088;288512;285824;285760 | 13184;7936;8384;7616;7744 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 1866869.33 | 1824864.00 | 10.00 | 77.40 | 9217.82 | false | 0.100096;0.099894;0.099875;0.099796;0.099852 | 285752320;285752320;285752320;285752320;285752320 | 1890208;1805856;1848992;1875744;1875872 | 1832128;1818560;1829792;1826240;1796416 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 7242.67 | 4843381.33 | 18.50 | 3.16 | 956.93 | true | 0.184536;0.184582;0.185060;0.184194;0.185330 | 15310848;15310848;15310848;15310848;15310848 | 7904;7200;6496;6624;8608 | 4837792;4817792;4843040;4849312;4860672 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 228768.00 | 666272.00 | 2.00 | 1.30 | 77.47 | true | 0.019897;0.019898;0.019898;0.019898;0.019898 | 1161984;1161984;1161984;1161984;1161984 | 228768;228768;228768;228768;228768 | 666208;672928;672320;655136;660288 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204672.00 | 1100640.00 | 44.60 | 0.00 | 0.00 | true | 0.446357;0.443994;0.446729;0.444582;0.450180 | 0;0;0;0;0 | 1204672;1204672;1204672;1204672;1204672 | 1107392;1100800;1091360;1111296;1093728 |
290 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 198 | 259072 | 8291072 | 171901440 | GPU_0_bfc | 8032000 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.33 | 1271872 | 13322.67 | 70656.00 | 2.30 | 15.15 | 200.83 | true | 0.023087;0.023086;0.023093;0.023096;0.023093 | 1271872;1271872;1271872;1271872;1271872 | 14080;13088;13632;11776;13248 | 69504;71232;69920;70816;78368 |
291 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 30.667 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 223296.00 | 19114.67 | 45.80 | 0.23 | 13.87 | true | 0.458326;0.457901;0.458776;0.458308;0.458132 | 55488;55488;55488;55488;55488 | 19712;21632;18688;18944;16000 | 223296;223296;223296;228416;223296 |
292 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 22 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 26325.33 | 45.40 | 2.35 | 16.18 | true | 0.454163;0.453812;0.452904;0.454325;0.453570 | 64736;64736;64736;64736;64736 | 26016;22560;25856;27104;27744 | 1216;1216;1216;1216;1216 |
293 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 21.667 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 256.00 | 2954.67 | 43.90 | 0.00 | 0.00 | true | 0.439028;0.439406;0.439020;0.439464;0.439204 | 0;0;0;0;0 | 4128;2848;2560;3072;2944 | 256;256;256;256;256 |
294 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 19.333 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 512.00 | 43.80 | 0.00 | 0.00 | true | 0.437658;0.437746;0.437571;0.437679;0.437614 | 0;0;0;0;0 | 0;0;0;0;0 | 512;256;640;512;512 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.67 | 285752320 | 1509034.67 | 1883264.00 | 10.00 | 84.24 | 9317.91 | false | 0.099747;0.099896;0.100064;0.099887;0.099925 | 285752320;285752320;285752320;285752320;285752320 | 1500416;1429888;1531776;1494912;1535744 | 1877280;1908064;1894304;1878208;1873024 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.33 | 15310848 | 8341.33 | 4943125.33 | 17.90 | 3.09 | 998.56 | true | 0.178857;0.178978;0.180526;0.178612;0.181235 | 15310848;15310848;15310848;15310848;15310848 | 7872;8896;7360;8256;10048 | 4946816;4952032;4918592;4930528;4955424 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 222368.00 | 695882.67 | 2.00 | 1.27 | 77.47 | true | 0.019965;0.019998;0.019955;0.019983;0.019965 | 1161984;1161984;1161984;1161984;1161984 | 697568;681760;701120;699168;690912 | 222368;222368;222368;222368;222368 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204480.00 | 817045.33 | 44.10 | 0.00 | 0.00 | true | 0.442007;0.440755;0.441736;0.439200;0.440602 | 0;0;0;0;0 | 1204480;1204480;1204480;1206784;1204480 | 806752;815456;817152;818528;821088 |
295 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 259072 | 8550400 | 171938560 | GPU_0_bfc | 8291328 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 23957.33 | 83338.67 | 2.30 | 11.85 | 254.37 | true | 0.022986;0.022993;0.022992;0.022985;0.023021 | 1271872;1271872;1271872;1271872;1271872 | 24608;25600;23040;21216;24224 | 82752;93568;83200;84064;76256 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 36.00 | 380608512 | 6951925.33 | 2706250.67 | 11.10 | 39.41 | 10572.46 | false | 0.110581;0.110762;0.110406;0.110344;0.110616 | 380608512;380608512;380608512;380608512;380608512 | 6964896;6962592;6928288;6923808;7031328 | 2720640;2703136;2717920;2697696;2695776 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 25.00 | 20414464 | 66549.33 | 6898133.33 | 20.30 | 2.93 | 816.58 | true | 0.203068;0.204365;0.203282;0.200496;0.203972 | 20414464;20414464;20414464;20414464;20414464 | 66208;66848;66272;66528;68448 | 6898784;6896320;6890048;6930592;6899296 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1606080.00 | 931968.00 | 43.90 | 0.00 | 0.00 | true | 0.439922;0.436091;0.437599;0.438815;0.439869 | 0;0;0;0;0 | 1606592;1606080;1606080;1606080;1606080 | 921216;907552;937792;940832;936896 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259872.00 | 757621.33 | 2.30 | 1.33 | 123.24 | true | 0.023190;0.023178;0.023165;0.023189;0.023168 | 1355648;1355648;1355648;1355648;1355648 | 747808;756384;768672;745632;769440 | 259872;259872;259872;259872;259872 |
296 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 202 | 295936 | 10810624 | 172012544 | GPU_0_bfc | 10514688 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1453568 | 5365.33 | 62336.00 | 2.60 | 21.47 | 242.26 | false | 0.026259;0.026270;0.026260;0.026260;0.026259 | 1453568;1453568;1453568;1453568;1453568 | 5536;5280;5280;5280;5536 | 54528;65024;61312;67456;60672 |
297 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 30 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 64736 | 260160.00 | 30677.33 | 46.00 | 0.22 | 12.95 | true | 0.460200;0.460207;0.460221;0.458858;0.460014 | 64736;64736;64736;64736;64736 | 29056;30208;32512;29312;33152 | 260160;260160;260160;260160;260160 |
298 | InceptionV4/InceptionV4/Mixed_6g/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 22 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1344.00 | 2176.00 | 44.80 | 21.02 | 18.50 | false | 0.450080;0.448720;0.447994;0.447823;0.447732 | 73984;73984;73984;73984;73984 | 1344;1344;1344;1600;1344 | 1536;1664;1024;3328;3744 |
299 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 21 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 256.00 | 0.00 | 44.00 | 0.00 | 0.00 | true | 0.439983;0.439844;0.439924;0.439785;0.439873 | 0;0;0;0;0 | 256;256;256;256;5376 | 0;256;0;0;0 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 35.33 | 333032448 | 5115221.33 | 2311818.67 | 9.90 | 44.84 | 9425.54 | false | 0.099386;0.099195;0.099377;0.099101;0.099187 | 333032448;333032448;333032448;333032448;333032448 | 5100032;5012608;5325312;5068032;5177600 | 2320992;2323584;2291872;2291328;2322592 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.00 | 17862656 | 27541.33 | 5844032.00 | 20.10 | 3.04 | 1050.74 | true | 0.201837;0.200449;0.199577;0.200743;0.202826 | 17862656;17862656;17862656;17862656;17862656 | 5796896;5826912;5869568;5883744;5835616 | 22848;25664;28864;28096;28992 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1405184.00 | 391370.67 | 43.60 | 0.00 | 0.00 | true | 0.436369;0.436763;0.435966;0.434380;0.434846 | 0;0;0;0;0 | 393504;392608;388000;386976;393600 | 1405184;1405184;1405184;1405184;1405184 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1355648 | 259360.00 | 822986.67 | 2.30 | 1.25 | 123.24 | true | 0.023184;0.023175;0.023164;0.023168;0.023175 | 1355648;1355648;1355648;1355648;1355648 | 259360;259360;259360;259360;259360 | 849376;827840;812096;794048;829024 |
300 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 9821184 | 172234496 | GPU_0_bfc | 9340160 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 672.00 | 31466.67 | 2.30 | 39.57 | 254.37 | false | 0.022974;0.022972;0.022973;0.022974;0.022972 | 1271872;1271872;1271872;1271872;1271872 | 5792;672;672;672;672 | 35584;33696;26880;28928;31776 |
301 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 29 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 64736 | 1216.00 | 640.00 | 45.80 | 34.88 | 16.18 | false | 0.457663;0.458532;0.457624;0.456578;0.461356 | 64736;64736;64736;64736;64736 | 5120;0;1664;0;256 | 1216;1216;1216;1216;1216 |
302 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 23.667 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 96.00 | 85.33 | 44.90 | 0.00 | 0.00 | true | 0.449190;0.447881;0.451207;0.448072;0.449633 | 0;0;0;0;0 | 96;96;96;96;96 | 256;1152;0;0;0 |
303 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 174 | 295936 | 1901568 | 172271360 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 80.00 | 256975104 | 5482.67 | 236469.33 | 3.10 | 1062.09 | 3212.19 | false | 0.031248;0.031249;0.031249;0.031249;0.031248 | 256975104;256975104;256975104;256975104;256975104 | 237984;232864;234720;236704;238432 | 5568;5312;5568;5568;5312 |
303 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 174 | 295936 | 1901568 | 172271360 | GPU_0_bfc | 1605632 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1605888.00 | 610186.67 | 43.60 | 0.00 | 0.00 | true | 0.434584;0.439458;0.437156;0.435737;0.436167 | 0;0;0;0;0 | 1605888;1605888;1607936;1605888;1605888 | 611168;606336;613056;618560;603296 |
304 | InceptionV4/InceptionV4/Mixed_6g/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 28.333 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1024.00 | 30762.67 | 44.40 | 2.33 | 18.50 | true | 0.443545;0.445721;0.444498;0.444140;0.444070 | 73984;73984;73984;73984;73984 | 1024;1024;1024;1024;1024 | 28896;31296;32928;31328;29664 |
306 | InceptionV4/InceptionV4/Mixed_6g/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 25 | 1479936 | 0 | 172086528 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 131797.33 | 61.40 | 0.00 | 0.00 | true | 0.616439;0.612725;0.614697;0.614349;0.613738 | 0;0;0;0;0 | 0;0;0;5120;0 | 131840;131840;131712;131200;131968 |
307 | InceptionV4/InceptionV4/Mixed_6h/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1024 17 17]] | 49 | 1183744 | 1183744 | 173270272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 14.00 | 5692057 | 4096.00 | 1046197.33 | 53.10 | 5.42 | 406.58 | true | 0.530657;0.531317;0.531884;0.530092;0.530001 | 5692057;5692057;5692057;5692057;5692057 | 4096;4096;5376;4096;4096 | 1046176;1046464;1046400;1046016;1045376 |
308 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 85.33 | 368074.67 | 3.10 | 341.93 | 2375.18 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;256;0;0;256 | 374368;369984;365376;367168;367072 |
308 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141 | 221952 | 1008384 | 173492224 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 535168.00 | 43.30 | 0.00 | 0.00 | true | 0.436325;0.423003;0.430662;0.431370;0.436543 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 530368;534528;533600;537696;537376 |
309 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141.667 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 114773.33 | 3.10 | 1096.81 | 2375.18 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 113984;120608;108512;115840;114496 |
309 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 141.667 | 221952 | 1008384 | 173714176 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 107189.33 | 43.60 | 0.00 | 0.00 | true | 0.435026;0.437274;0.438596;0.435484;0.431427 | 0;0;0;0;0 | 786432;786432;786432;786432;786432 | 108192;101152;113664;106176;107200 |
310 | InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 169 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 78.00 | 251769216 | 0.00 | 383488.00 | 4.80 | 656.52 | 3227.81 | false | 0.047554;0.047748;0.047491;0.047853;0.047493 | 251769216;251769216;251769216;251769216;251769216 | 0;0;0;128;0 | 390016;385376;377504;380896;384192 |
310 | InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 17 17]] | 169 | 443904 | 2016768 | 174158080 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 12.00 | 0 | 1572885.33 | 1193621.33 | 44.90 | 0.00 | 0.00 | true | 0.448366;0.451057;0.446861;0.450878;0.448733 | 0;0;0;0;0 | 1572864;1572864;1572928;1572928;1572864 | 1190912;1193184;1194592;1198464;1193088 |
311 | InceptionV4/InceptionV4/Mixed_6h/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 150 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 66.67 | 83923072 | 749141.33 | 268533.33 | 3.10 | 82.47 | 1258.84 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 83923072;83923072;83923072;83923072;83923072 | 746112;747648;750208;749568;750592 | 269088;269856;267424;269088;263456 |
311 | InceptionV4/InceptionV4/Mixed_6h/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 128 17 17]] | 150 | 147968 | 672256 | 172826112 | GPU_0_bfc | 524288 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 6.00 | 0 | 524288.00 | 199669.33 | 43.00 | 0.00 | 0.00 | true | 0.428198;0.430016;0.430005;0.429563;0.430176 | 0;0;0;0;0 | 199456;198688;200512;199040;205056 | 526848;524288;524288;524288;524288 |
312 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 55488 | 223232.00 | 15061.33 | 45.80 | 0.23 | 12.81 | true | 0.458399;0.459332;0.457272;0.456784;0.457111 | 55488;55488;55488;55488;55488 | 223232;223232;223232;223232;223232 | 14976;15104;15104;14976;15104 |
313 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 22.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 207872.00 | 36192.00 | 44.80 | 0.23 | 13.87 | true | 0.446462;0.448599;0.448437;0.450677;0.447001 | 55488;55488;55488;55488;55488 | 207872;207872;207872;207872;207872 | 36352;36128;36352;36096;35712 |
314 | InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 17 17]] | 22 | 443904 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 110976 | 1877.33 | 350538.67 | 51.80 | 0.31 | 27.74 | true | 0.518176;0.517492;0.520889;0.517854;0.519377 | 110976;110976;110976;110976;110976 | 351584;349152;350880;353696;328288 | 1536;3584;1536;1536;2560 |
315 | InceptionV4/InceptionV4/Mixed_6h/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 128 17 17]] | 22 | 147968 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 36992 | 512.00 | 11818.67 | 43.50 | 3.00 | 9.25 | true | 0.433143;0.435262;0.435487;0.435366;0.435099 | 36992;36992;36992;36992;36992 | 512;512;512;512;512 | 15360;13440;6656;11648;10368 |
316 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.333 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 256.00 | 43818.67 | 44.00 | 0.00 | 0.00 | true | 0.440123;0.440458;0.438974;0.440328;0.440653 | 0;0;0;0;0 | 256;256;256;1280;256 | 43776;43648;43904;43776;43904 |
317 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 20.667 | 221952 | 0 | 171642368 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 2218.67 | 43.60 | 0.00 | 0.00 | true | 0.435467;0.435506;0.435663;0.435716;0.436062 | 0;0;0;0;0 | 0;0;0;0;0 | 1792;2944;2688;1152;2176 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 30.00 | 244930560 | 3125.33 | 1091338.67 | 8.70 | 223.79 | 8164.35 | false | 0.086700;0.086531;0.086570;0.086465;0.086626 | 244930560;244930560;244930560;244930560;244930560 | 1090528;1058944;1112160;1113440;1071328 | 2720;2848;1568;3808;5408 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 14.00 | 13123584 | 2432.00 | 3831477.33 | 16.80 | 3.42 | 937.40 | true | 0.168705;0.167016;0.169690;0.169726;0.166910 | 13123584;13123584;13123584;13123584;13123584 | 2368;2368;2560;2368;2624 | 3828864;3850592;3814976;3803456;3865568 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 11.00 | 1161984 | 222837.33 | 800768.00 | 2.00 | 1.14 | 105.63 | true | 0.019991;0.019982;0.020005;0.019985;0.019971 | 1161984;1161984;1161984;1161984;1161984 | 223136;222240;222880;222880;222752 | 801056;790656;807872;811744;793376 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.00 | 0 | 1032192.00 | 287168.00 | 43.40 | 0.00 | 0.00 | true | 0.431387;0.433634;0.435176;0.432956;0.434559 | 0;0;0;0;0 | 1032192;1032192;1032192;1032192;1037312 | 283616;299008;284704;282752;293184 |
318 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 184.667 | 221952 | 7155968 | 171864320 | GPU_0_bfc | 6934016 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 4.33 | 1090176 | 1429.33 | 266357.33 | 2.00 | 4.07 | 251.60 | true | 0.019939;0.019956;0.019930;0.019936;0.019942 | 1090176;1090176;1090176;1090176;1090176 | 266944;270560;263968;264832;267296 | 1344;1600;1344;1344;1600 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 2199328.00 | 1826656.00 | 10.00 | 70.98 | 9217.82 | false | 0.099522;0.099833;0.099895;0.099861;0.099478 | 285752320;285752320;285752320;285752320;285752320 | 2192992;2195040;2209952;2227040;2121312 | 1835360;1823712;1820896;1837344;1810688 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.67 | 15310848 | 4277.33 | 4846624.00 | 18.50 | 3.16 | 977.27 | true | 0.184702;0.184319;0.185476;0.185877;0.185087 | 15310848;15310848;15310848;15310848;15310848 | 3936;4192;4448;4192;5088 | 4839616;4849536;4850720;4834272;4852224 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 228768.00 | 609024.00 | 2.00 | 1.39 | 77.47 | true | 0.019898;0.019897;0.019898;0.019898;0.019898 | 1161984;1161984;1161984;1161984;1161984 | 228768;228768;228768;228768;228768 | 614592;602912;600896;614208;609952 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204672.00 | 1065429.33 | 44.30 | 0.00 | 0.00 | true | 0.453142;0.443165;0.439247;0.441480;0.445812 | 0;0;0;0;0 | 1209792;1204672;1204672;1204672;1204672 | 1072864;1077376;1047584;1042976;1075840 |
319 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 193.667 | 259072 | 9327104 | 171901440 | GPU_0_bfc | 9068032 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1271872 | 6645.33 | 110346.67 | 2.30 | 10.87 | 211.98 | true | 0.023091;0.023091;0.023093;0.023091;0.023092 | 1271872;1271872;1271872;1271872;1271872 | 7904;6560;6560;6688;6688 | 110656;109664;110720;105792;114112 |
320 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 29 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 55488 | 223296.00 | 46688.00 | 45.90 | 0.21 | 11.10 | true | 0.458144;0.458490;0.463729;0.458582;0.459606 | 55488;55488;55488;55488;55488 | 223296;223296;223296;223296;223296 | 45952;46464;46240;47360;47648 |
321 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 22 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 64736 | 1216.00 | 1322.67 | 45.30 | 25.50 | 14.94 | false | 0.452833;0.453847;0.452937;0.452022;0.452442 | 64736;64736;64736;64736;64736 | 1536;1280;1280;1152;1408 | 1216;1216;1216;1216;1216 |
322 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0b_7x1/Relu | Relu | [[1 192 17 17]] | 21.667 | 221952 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 256.00 | 36736.00 | 44.00 | 0.00 | 0.00 | true | 0.440125;0.439751;0.439584;0.439756;0.439736 | 0;0;0;0;0 | 39552;34432;36608;38400;35200 | 256;512;256;256;256 |
323 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 224 17 17]] | 20.333 | 259072 | 0 | 171679488 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 469.33 | 43.80 | 0.00 | 0.00 | true | 0.438677;0.437960;0.437939;0.437681;0.437806 | 0;0;0;0;0 | 768;0;0;0;0 | 512;256;384;640;512 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 31.00 | 285752320 | 1777440.00 | 1914602.67 | 10.00 | 77.40 | 9217.82 | false | 0.099916;0.100003;0.099974;0.099695;0.099927 | 285752320;285752320;285752320;285752320;285752320 | 1801184;1779744;1809824;1751392;1744352 | 1913664;1917152;1933888;1895424;1912992 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 16.00 | 15310848 | 7722.67 | 4903221.33 | 18.10 | 3.12 | 956.93 | true | 0.181198;0.179715;0.184011;0.182074;0.181170 | 15310848;15310848;15310848;15310848;15310848 | 8448;7424;6464;8832;7296 | 4897408;4913760;4898496;4948352;4896960 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, false>(float2*, float const*, int, int3, int3, int2, int2) | 15.00 | 1161984 | 222368.00 | 625557.33 | 2.00 | 1.37 | 77.47 | true | 0.019950;0.019953;0.019964;0.019969;0.019959 | 1161984;1161984;1161984;1161984;1161984 | 625152;629120;622784;610944;628736 | 222368;222368;222368;222368;222368 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1204480.00 | 834272.00 | 43.90 | 0.00 | 0.00 | true | 0.440925;0.439295;0.439034;0.437217;0.437142 | 0;0;0;0;0 | 1204480;1204480;1204480;1206528;1204480 | 832416;841952;837472;827104;832928 |
324 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 189 | 259072 | 9289984 | 171938560 | GPU_0_bfc | 9030912 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, false>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 1728.00 | 96192.00 | 2.30 | 12.99 | 254.37 | true | 0.022996;0.023011;0.022995;0.022994;0.022995 | 1271872;1271872;1271872;1271872;1271872 | 1184;1568;6560;1952;1664 | 96416;91392;94144;98016;103552 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 36.67 | 380608512 | 7020384.00 | 2699488.00 | 11.10 | 39.16 | 10380.14 | false | 0.110712;0.110614;0.110590;0.110617;0.110536 | 380608512;380608512;380608512;380608512;380608512 | 6934272;6982784;7068928;7061504;7016864 | 2713920;2702080;2702304;2674720;2694080 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 25.67 | 20414464 | 96650.67 | 7096714.67 | 20.20 | 2.84 | 795.36 | true | 0.201149;0.202094;0.202480;0.201689;0.201346 | 20414464;20414464;20414464;20414464;20414464 | 7095008;7076640;7110304;7109408;7085728 | 93984;95840;98656;98464;95648 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.67 | 0 | 1606080.00 | 1023488.00 | 44.30 | 0.00 | 0.00 | true | 0.440716;0.443684;0.447896;0.444213;0.439919 | 0;0;0;0;0 | 1036128;1022464;1024512;1023072;1022880 | 1607872;1606080;1606080;1606080;1606080 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.33 | 1355648 | 259872.00 | 661728.00 | 2.30 | 1.47 | 131.20 | true | 0.023173;0.023173;0.023166;0.023165;0.023174 | 1355648;1355648;1355648;1355648;1355648 | 653024;667936;658144;664256;662784 | 259872;259872;259872;259872;259872 |
325 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 208 | 295936 | 11128576 | 172012544 | GPU_0_bfc | 10832640 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 6.00 | 1453568 | 5280.00 | 48202.67 | 2.60 | 27.18 | 242.26 | false | 0.026235;0.026238;0.026239;0.026238;0.026238 | 1453568;1453568;1453568;1453568;1453568 | 5536;5280;5280;5280;5280 | 44832;49696;47904;51232;47008 |
326 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 29.667 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 64736 | 260160.00 | 18602.67 | 46.00 | 0.23 | 14.94 | true | 0.461310;0.460091;0.458783;0.460958;0.459421 | 64736;64736;64736;64736;64736 | 260160;260160;260160;260160;260160 | 18688;18944;17792;19200;18176 |
327 | InceptionV4/InceptionV4/Mixed_6h/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 21.667 | 295936 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 73984 | 1344.00 | 938.67 | 44.90 | 32.41 | 15.85 | false | 0.449014;0.448579;0.448552;0.448157;0.449022 | 73984;73984;73984;73984;73984 | 1344;1344;1344;1344;1344 | 1024;1024;768;7168;768 |
328 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0c_1x7/Relu | Relu | [[1 224 17 17]] | 21.333 | 259072 | 0 | 171753472 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 256.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.439854;0.438807;0.439638;0.439393;0.439168 | 0;0;0;0;0 | 256;256;256;256;256 | 0;0;0;0;0 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 35.67 | 333032448 | 4987936.00 | 2383808.00 | 9.90 | 45.18 | 9337.27 | false | 0.099248;0.099313;0.099187;0.099176;0.099152 | 333032448;333032448;333032448;333032448;333032448 | 2374208;2390752;2388576;2355392;2388640 | 4950560;4997024;5016224;5213472;4944800 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 17.00 | 17862656 | 23018.67 | 5808554.67 | 20.20 | 3.06 | 1050.74 | true | 0.201049;0.199080;0.202753;0.202647;0.201832 | 17862656;17862656;17862656;17862656;17862656 | 5812800;5803360;5809504;5829984;5797984 | 21312;22848;24896;25408;20288 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1405184.00 | 486421.33 | 43.70 | 0.00 | 0.00 | true | 0.435936;0.438189;0.436635;0.434561;0.438023 | 0;0;0;0;0 | 1405184;1405184;1405184;1405184;1405184 | 470784;491936;480256;495520;487072 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.33 | 1355648 | 259360.00 | 763552.00 | 2.30 | 1.33 | 131.20 | true | 0.023185;0.023175;0.023178;0.023170;0.023182 | 1355648;1355648;1355648;1355648;1355648 | 774560;761760;748224;754336;778176 | 259360;259360;259360;259360;259360 |
329 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 224 17 17]] | 192.333 | 481024 | 10264832 | 172234496 | GPU_0_bfc | 9783808 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1271872 | 928.00 | 18080.00 | 2.30 | 66.91 | 254.37 | false | 0.022976;0.022979;0.022982;0.022972;0.022971 | 1271872;1271872;1271872;1271872;1271872 | 1184;672;800;928;1056 | 20352;19552;17536;17152;16224 |
330 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 224 17 17]] | 29 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 64736 | 1216.00 | 85.33 | 45.70 | 49.75 | 14.94 | false | 0.458800;0.456685;0.457165;0.457474;0.455857 | 64736;64736;64736;64736;64736 | 0;0;256;256;0 | 1216;8128;1216;1216;1216 |
331 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0d_7x1/Relu | Relu | [[1 224 17 17]] | 22 | 481024 | 0 | 171975424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 181.33 | 0.00 | 44.90 | 0.00 | 0.00 | true | 0.449273;0.449287;0.449060;0.448374;0.448573 | 0;0;0;0;0 | 96;352;96;96;2144 | 0;1408;0;0;0 |
332 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 171.333 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 79.33 | 256975104 | 6592.00 | 162538.67 | 3.10 | 1519.39 | 3239.20 | false | 0.031248;0.031249;0.031248;0.031249;0.031248 | 256975104;256975104;256975104;256975104;256975104 | 157184;160032;165600;164224;163360 | 8640;5568;5568;5312;10688 |
332 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 171.333 | 295936 | 1923584 | 172271360 | GPU_0_bfc | 1627648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1605888.00 | 863989.33 | 43.60 | 0.00 | 0.00 | true | 0.435147;0.437244;0.438104;0.435430;0.436459 | 0;0;0;0;0 | 869152;858528;864160;872864;858656 | 1605888;1605888;1605888;1605888;1605888 |
333 | InceptionV4/InceptionV4/Mixed_6h/Branch_2/Conv2d_0e_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 30.333 | 295936 | 0 | 171790336 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.33 | 73984 | 1024.00 | 11093.33 | 44.40 | 6.11 | 17.07 | true | 0.444183;0.443004;0.444082;0.443934;0.444967 | 73984;73984;73984;73984;73984 | 1024;1024;1024;1024;1024 | 11008;12160;10400;11136;11136 |
335 | InceptionV4/InceptionV4/Mixed_6h/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1024 17 17]] | 25 | 1627648 | 0 | 172234240 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 37248.00 | 61.40 | 0.00 | 0.00 | true | 0.608688;0.614619;0.614654;0.614122;0.612866 | 0;0;0;0;0 | 37888;36864;36992;40576;36224 | 0;0;0;0;0 |
336 | InceptionV4/InceptionV4/Mixed_7a/Branch_2/MaxPool_1a_3x3/MaxPool | MaxPool | [[1 1024 8 8]] | 42 | 262144 | 262144 | 172496384 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::maxpooling_func<float, (cudnnNanPropagation_t)0>, 0, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 65536 | 7424.00 | 27274.67 | 28.70 | 1.89 | 9.36 | true | 0.287352;0.287225;0.287053;0.287317;0.286783 | 65536;65536;65536;65536;65536 | 7168;7936;7168;7168;9216 | 26880;41856;27008;27936;26240 |
337 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 143.667 | 295936 | 1344512 | 172792320 | GPU_0_bfc | 1048576 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 167846144 | 0.00 | 413824.00 | 3.10 | 405.60 | 3166.91 | false | 0.031244;0.031244;0.031243;0.031243;0.031243 | 167846144;167846144;167846144;167846144;167846144 | 414016;416512;414560;412896;408128 | 0;0;0;256;0 |
337 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 143.667 | 295936 | 1344512 | 172792320 | GPU_0_bfc | 1048576 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.33 | 0 | 1048576.00 | 720874.67 | 43.70 | 0.00 | 0.00 | true | 0.437647;0.435114;0.438847;0.428901;0.440698 | 0;0;0;0;0 | 1048576;1048576;1048576;1048576;1048576 | 717536;704352;720960;724128;727008 |
338 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 139.667 | 221952 | 1008384 | 173014272 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 53.00 | 125884608 | 0.00 | 181045.33 | 3.10 | 695.32 | 2375.18 | false | 0.031246;0.031246;0.031246;0.031246;0.031246 | 125884608;125884608;125884608;125884608;125884608 | 0;0;0;0;0 | 180480;179232;180224;182464;182432 |
338 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 192 17 17]] | 139.667 | 221952 | 1008384 | 173014272 | GPU_0_bfc | 786432 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 8.00 | 0 | 786432.00 | 36010.67 | 43.70 | 0.00 | 0.00 | true | 0.436611;0.440105;0.432787;0.435464;0.439481 | 0;0;0;0;0 | 786432;786432;791552;786432;786432 | 37248;35840;35840;36352;35328 |
339 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 29 | 295936 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 73984 | 1280.00 | 30037.33 | 44.40 | 2.36 | 18.50 | true | 0.443048;0.444860;0.444455;0.443805;0.444595 | 73984;73984;73984;73984;73984 | 1280;3840;1280;1280;1280 | 30336;31488;30336;29440;29056 |
340 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 192 17 17]] | 23 | 221952 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 55488 | 768.00 | 23296.00 | 44.30 | 2.31 | 13.87 | true | 0.442015;0.443020;0.444204;0.442324;0.442678 | 55488;55488;55488;55488;55488 | 768;768;768;768;768 | 23552;23424;23680;22016;22912 |
341 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 256 17 17]] | 21.333 | 295936 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 85.33 | 554.67 | 43.80 | 0.00 | 0.00 | true | 0.437819;0.437337;0.437634;0.437648;0.437853 | 0;0;0;0;0 | 256;0;2048;0;0 | 512;640;256;640;512 |
342 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 192 17 17]] | 19.333 | 221952 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 0.00 | 43.60 | 0.00 | 0.00 | true | 0.436298;0.436191;0.435833;0.436130;0.436503 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
343 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 188.333 | 295936 | 2130944 | 171682560 | GPU_0_bfc | 1835008 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 89.00 | 293675264 | 128.00 | 277056.00 | 3.10 | 1059.50 | 3299.72 | false | 0.031246;0.031247;0.031246;0.031246;0.031247 | 293675264;293675264;293675264;293675264;293675264 | 128;128;128;128;128 | 283008;274144;278624;273536;278400 |
343 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/mul | Conv2D | [[1 256 17 17]] | 188.333 | 295936 | 2130944 | 171682560 | GPU_0_bfc | 1835008 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 13.00 | 0 | 1835093.33 | 842730.67 | 44.10 | 0.00 | 0.00 | true | 0.438236;0.440975;0.441856;0.440229;0.445566 | 0;0;0;0;0 | 1835008;1835008;1835264;1835008;1841664 | 829248;843392;840288;852896;844512 |
344 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 177.667 | 49152 | 1376256 | 171435776 | GPU_0_bfc | 1327104 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 86.00 | 42479616 | 85.33 | 3989.33 | 3.10 | 10425.30 | 493.95 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 42479616;42479616;42479616;42479616;42479616 | 3840;3968;4128;3872;4896 | 128;128;384;0;0 |
344 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 192 8 8]] | 177.667 | 49152 | 1376256 | 171435776 | GPU_0_bfc | 1327104 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.33 | 0 | 1327104.00 | 105696.00 | 43.20 | 0.00 | 0.00 | true | 0.432897;0.430584;0.431817;0.432667;0.431723 | 0;0;0;0;0 | 1327104;1327104;1327104;1327104;1327104 | 107552;109248;105664;103872;102304 |
345 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/BatchNorm/batchnorm/add_1 | Add | [[1 256 17 17]] | 29 | 295936 | 0 | 171213824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 73984 | 1152.00 | 469.33 | 45.20 | 45.63 | 15.85 | false | 0.451909;0.451318;0.451444;0.450937;0.451879 | 73984;73984;73984;73984;73984 | 512;128;768;384;512 | 1152;1152;1152;2176;1152 |
346 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 192 8 8]] | 22 | 49152 | 0 | 171213824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 12288 | 768.00 | 0.00 | 45.50 | 16.00 | 3.07 | true | 0.452432;0.455246;0.455121;0.456457;0.455542 | 12288;12288;12288;12288;12288 | 768;768;768;768;768 | 0;0;0;0;0 |
347 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0b_1x7/Relu | Relu | [[1 256 17 17]] | 20 | 295936 | 0 | 171213824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 170.67 | 43.70 | 0.00 | 0.00 | true | 0.437196;0.437801;0.438080;0.436979;0.436925 | 0;0;0;0;0 | 0;0;0;0;0 | 256;128;256;128;0 |
348 | InceptionV4/InceptionV4/Mixed_7a/Branch_0/Conv2d_1a_3x3/Relu | Relu | [[1 192 8 8]] | 20.333 | 49152 | 0 | 171213824 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.438905;0.439225;0.438851;0.439103;0.438980 | 0;0;0;0;0 | 0;0;0;0;1536 | 0;0;0;0;0 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 236 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | volta_gcgemm_64x32_nt | 57.00 | 543303680 | 11247797.33 | 3123669.33 | 13.30 | 37.80 | 9531.64 | false | 0.132772;0.133381;0.133601;0.133161;0.133381 | 543303680;543303680;543303680;543303680;543303680 | 3114752;3110144;3137312;3126048;3130208 | 11247456;11252320;11243616;11272672;11238880 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 236 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, true, false>(float2*, float const*, int, int3, int3, int2, int2) | 31.00 | 29163520 | 774165.33 | 10415274.67 | 22.50 | 2.61 | 940.76 | true | 0.225002;0.225639;0.225334;0.225647;0.224646 | 29163520;29163520;29163520;29163520;29163520 | 775808;771584;777088;775104;769408 | 10422848;10423008;10387008;10401248;10421728 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 236 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 14.33 | 0 | 2293760.00 | 659413.33 | 44.10 | 0.00 | 0.00 | true | 0.439403;0.440350;0.442707;0.441129;0.440594 | 0;0;0;0;0 | 2293760;2293760;2293760;2293760;2293760 | 664448;652352;661088;659104;658048 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 236 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | void fft1d_r2c_32<float, float, float2, false, true>(float2*, float const*, int, int3, int3, int2, int2) | 10.67 | 1549312 | 296608.00 | 833962.67 | 2.60 | 1.37 | 145.24 | true | 0.026479;0.026502;0.026496;0.026519;0.026494 | 1549312;1549312;1549312;1549312;1549312 | 825984;830976;844928;847232;824448 | 296608;296608;296608;296608;296608 |
349 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/mul | Conv2D | [[1 320 17 17]] | 236 | 468736 | 15235840 | 171682560 | GPU_0_bfc | 14767104 | 0 | 0 | 0 | void fft1d_c2r_32<float2, float, float, false, true, false, true>(float*, float2 const*, int, int3, int3, int2, int, float, float, float*, float*) | 5.00 | 1816960 | 928.00 | 26410.67 | 3.20 | 66.46 | 363.39 | false | 0.032388;0.032374;0.032375;0.032372;0.032374 | 1816960;1816960;1816960;1816960;1816960 | 928;928;928;928;3488 | 26624;25856;25472;26752;33024 |
350 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/BatchNorm/batchnorm/add_1 | Add | [[1 320 17 17]] | 29 | 468736 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 92480 | 1600.00 | 170.67 | 49.30 | 52.23 | 23.12 | false | 0.492239;0.493263;0.493485;0.492687;0.492874 | 92480;92480;92480;92480;92480 | 1600;1600;1600;1600;1600 | 512;0;0;0;640 |
351 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_0c_7x1/Relu | Relu | [[1 320 17 17]] | 21 | 468736 | 0 | 171386624 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 96.00 | 0.00 | 50.70 | 0.00 | 0.00 | true | 0.506579;0.506212;0.506482;0.507825;0.507277 | 0;0;0;0;0 | 96;96;96;96;96 | 0;0;0;0;0 |
352 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 296.667 | 81920 | 3768320 | 171468544 | GPU_0_bfc | 3686400 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 188.33 | 117985280 | 1600170.67 | 682592.00 | 3.10 | 51.69 | 626.47 | false | 0.031250;0.031250;0.031250;0.031250;0.031250 | 117985280;117985280;117985280;117985280;117985280 | 1582464;1599616;1635712;1601536;1599360 | 680768;669664;709536;673728;693280 |
352 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/mul | Conv2D | [[1 320 8 8]] | 296.667 | 81920 | 3768320 | 171468544 | GPU_0_bfc | 3686400 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 21.67 | 0 | 3704533.33 | 2186720.00 | 44.50 | 0.00 | 0.00 | true | 0.445645;0.445573;0.444957;0.445213;0.445170 | 0;0;0;0;0 | 3703872;3702144;3716544;3698240;3707584 | 2190208;2181888;2197376;2188064;2179232 |
353 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/BatchNorm/batchnorm/add_1 | Add | [[1 320 8 8]] | 28.333 | 81920 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 20480 | 1600.00 | 24586.67 | 46.40 | 0.78 | 5.12 | true | 0.464623;0.464301;0.463971;0.465274;0.463245 | 20480;20480;20480;20480;20480 | 1600;2112;1600;1600;1600 | 24704;25056;23328;24832;24224 |
354 | InceptionV4/InceptionV4/Mixed_7a/Branch_1/Conv2d_1a_3x3/Relu | Relu | [[1 320 8 8]] | 21 | 81920 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 64.00 | 44.20 | 0.00 | 0.00 | true | 0.441362;0.441743;0.441371;0.441641;0.441802 | 0;0;0;0;0 | 0;0;0;0;0 | 64;64;64;64;64 |
356 | InceptionV4/InceptionV4/Mixed_7b/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1536 8 8]] | 43.333 | 393216 | 393216 | 171468544 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.67 | 1716066 | 7680.00 | 171914.67 | 39.70 | 9.56 | 223.82 | true | 0.396734;0.396500;0.397724;0.397094;0.400441 | 1716066;1716066;1716066;1716066;1716066 | 7680;7680;7680;7680;7680 | 173664;178048;163808;174688;167392 |
357 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 174.667 | 98304 | 2457600 | 171566848 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 75522048 | 0.00 | 203605.33 | 3.10 | 370.92 | 993.71 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 0;0;5120;0;0 | 201248;206752;214464;202816;199200 |
357 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 174.667 | 98304 | 2457600 | 171566848 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 2359296.00 | 1829248.00 | 45.00 | 0.00 | 0.00 | true | 0.452384;0.449150;0.448426;0.448942;0.450748 | 0;0;0;0;0 | 2359296;2359296;2359296;2361600;2359296 | 1834944;1823328;1807040;1832384;1832032 |
358 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 165.333 | 98304 | 2457600 | 171665152 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 75522048 | 0.00 | 7125.33 | 3.10 | 10599.09 | 993.71 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 0;0;0;0;0 | 6528;7424;9344;7424;5504 |
358 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 165.333 | 98304 | 2457600 | 171665152 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.33 | 0 | 2359296.00 | 316330.67 | 43.40 | 0.00 | 0.00 | true | 0.433357;0.433914;0.434123;0.433431;0.433922 | 0;0;0;0;0 | 2359296;2360832;2359296;2359296;2359296 | 316064;317888;313728;317312;315616 |
359 | InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 168.333 | 114688 | 1687552 | 171779840 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 50348032 | 0.00 | 768.00 | 3.10 | 65557.33 | 662.47 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 256;0;0;0;0 | 896;896;640;512;768 |
359 | InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 168.333 | 114688 | 1687552 | 171779840 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1572864.00 | 120810.67 | 43.10 | 0.00 | 0.00 | true | 0.430231;0.430257;0.430186;0.431437;0.434536 | 0;0;0;0;0 | 1572864;1572864;1586688;1572864;1572864 | 121056;120480;119392;121408;120896 |
360 | InceptionV4/InceptionV4/Mixed_7b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 184.667 | 65536 | 1638400 | 171376640 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 100.00 | 50348032 | 393216.00 | 312074.67 | 3.10 | 71.39 | 503.48 | false | 0.031249;0.031249;0.031249;0.031249;0.031250 | 50348032;50348032;50348032;50348032;50348032 | 311584;314112;315264;308352;310528 | 393216;393216;393216;393216;393216 |
360 | InceptionV4/InceptionV4/Mixed_7b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 184.667 | 65536 | 1638400 | 171376640 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1572864.00 | 246741.33 | 43.50 | 0.00 | 0.00 | true | 0.438753;0.434957;0.433534;0.433769;0.436707 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 246272;245632;242432;250368;248320 |
361 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 28.333 | 98304 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 5.00 | 24576 | 97152.00 | 75349.33 | 46.40 | 0.14 | 4.92 | true | 0.465690;0.463991;0.463764;0.463901;0.465037 | 24576;24576;24576;24576;24576 | 97408;97024;97024;96640;98304 | 75520;75392;75136;74880;75520 |
362 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 23 | 98304 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1621.33 | 0.00 | 45.40 | 15.16 | 6.14 | true | 0.453198;0.454092;0.453571;0.453963;0.454834 | 24576;24576;24576;24576;24576 | 1792;1792;1536;1536;1536 | 0;0;0;0;8576 |
363 | InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 20.667 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 1024.00 | 45.60 | 8.00 | 4.10 | true | 0.455756;0.456129;0.455937;0.455879;0.455391 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 1280;1024;1024;1024;1024 |
364 | InceptionV4/InceptionV4/Mixed_7b/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 21 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 0.00 | 45.40 | 16.00 | 4.10 | true | 0.450848;0.456022;0.449794;0.456433;0.455648 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 0;128;0;0;0 |
365 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 20.667 | 98304 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 896.00 | 44.10 | 0.00 | 0.00 | true | 0.441413;0.441650;0.441177;0.440846;0.441530 | 0;0;0;0;0 | 0;0;0;0;0 | 896;640;896;896;1024 |
366 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 20 | 98304 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 0.00 | 43.90 | 0.00 | 0.00 | true | 0.439179;0.439548;0.439324;0.439313;0.439212 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;128 |
367 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 152 | 114688 | 2179072 | 171098112 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 66088960 | 0.00 | 32714.67 | 3.10 | 2020.16 | 1126.51 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 66088960;66088960;66088960;66088960;66088960 | 0;0;0;0;0 | 32800;32576;32416;33088;32768 |
367 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 152 | 114688 | 2179072 | 171098112 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 14.00 | 0 | 2064448.00 | 208234.67 | 42.80 | 0.00 | 0.00 | true | 0.428144;0.427593;0.427697;0.428391;0.429374 | 0;0;0;0;0 | 2064448;2064448;2064448;2064448;2064448 | 207808;209568;207840;207840;209024 |
368 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 147 | 98304 | 1277952 | 171098112 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 37765120 | 0.00 | 768.00 | 3.10 | 49173.33 | 643.72 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 0;0;0;0;0 | 384;6784;640;1024;640 |
368 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 147 | 98304 | 1277952 | 171098112 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 89088.00 | 43.40 | 0.00 | 0.00 | true | 0.431936;0.434496;0.433340;0.435661;0.435714 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179648 | 89152;88800;89536;89312;88512 |
369 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 146.667 | 65536 | 1245184 | 171163648 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 37765120 | 0.00 | 213.33 | 3.10 | 177024.28 | 640.09 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 0;256;256;128;256 | 0;0;0;0;0 |
369 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 146.667 | 65536 | 1245184 | 171163648 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 186890.67 | 43.30 | 0.00 | 0.00 | true | 0.432213;0.433883;0.433988;0.432010;0.433128 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179648 | 187104;181376;186944;186624;187264 |
370 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 448 8 8]] | 28.333 | 114688 | 0 | 171065344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 28672 | 2048.00 | 256.00 | 46.30 | 12.44 | 7.17 | true | 0.464045;0.463086;0.463512;0.463261;0.462443 | 28672;28672;28672;28672;28672 | 256;256;256;256;256 | 2048;2048;3840;2048;2048 |
371 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 22.667 | 98304 | 0 | 171065344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 0.00 | 45.50 | 16.00 | 4.10 | true | 0.455637;0.454769;0.455237;0.454278;0.456615 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 0;0;0;0;128 |
372 | InceptionV4/InceptionV4/Mixed_7b/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 20.333 | 65536 | 0 | 171065344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 85.33 | 45.60 | 14.77 | 4.10 | true | 0.455514;0.456008;0.455471;0.455938;0.454980 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 128;0;128;0;128 |
373 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0b_3x1/Relu | Relu | [[1 448 8 8]] | 21 | 114688 | 0 | 171065344 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 768.00 | 44.20 | 0.00 | 0.00 | true | 0.441559;0.441750;0.441393;0.441590;0.441584 | 0;0;0;0;0 | 768;768;768;768;768 | 0;0;0;0;0 |
374 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 162.667 | 131072 | 2883584 | 171196416 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 68.00 | 88113152 | 0.00 | 187541.33 | 3.10 | 469.83 | 1295.78 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 88113152;88113152;88113152;88113152;88113152 | 0;0;0;0;0 | 188288;185024;184768;191712;189312 |
374 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 162.667 | 131072 | 2883584 | 171196416 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 17.00 | 0 | 2752640.00 | 1035264.00 | 43.80 | 0.00 | 0.00 | true | 0.440280;0.437111;0.436985;0.438383;0.437751 | 0;0;0;0;0 | 2752640;2752640;2752640;2752640;2752640 | 1031424;1037920;1042688;1027264;1036448 |
375 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 512 8 8]] | 27.667 | 131072 | 0 | 171081728 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 32768 | 2048.00 | 512.00 | 45.70 | 12.80 | 8.19 | true | 0.456181;0.457060;0.453660;0.457764;0.457790 | 32768;32768;32768;32768;32768 | 512;512;512;512;384 | 2048;2048;2048;2048;2048 |
376 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0c_1x3/Relu | Relu | [[1 512 8 8]] | 22.333 | 131072 | 0 | 171081728 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 85.33 | 44.20 | 0.00 | 0.00 | true | 0.442146;0.441681;0.441527;0.441753;0.441632 | 0;0;0;0;0 | 0;192;64;192;0 | 0;0;0;0;0 |
377 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 164 | 98304 | 1671168 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 50348032 | 0.00 | 14773.33 | 3.10 | 3408.03 | 662.47 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;0;0;0;0 | 15200;14944;16352;14176;14176 |
377 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 164 | 98304 | 1671168 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1572864.00 | 228021.33 | 43.20 | 0.00 | 0.00 | true | 0.431848;0.432388;0.431052;0.432194;0.432516 | 0;0;0;0;0 | 1572864;1574912;1572864;1572864;1572864 | 230816;228192;229664;226208;225280 |
378 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 162.667 | 114688 | 1687552 | 171294720 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.33 | 50348032 | 170.67 | 38272.00 | 3.10 | 1309.69 | 659.58 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;5120;512;0;0 | 57088;34944;45056;34816;34432 |
378 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 162.667 | 114688 | 1687552 | 171294720 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1572864.00 | 146122.67 | 43.20 | 0.00 | 0.00 | true | 0.432176;0.430008;0.433893;0.432592;0.431335 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 148352;145920;135808;144096;149280 |
379 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 27.333 | 98304 | 0 | 171163648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 3072.00 | 45.80 | 4.00 | 4.10 | true | 0.457927;0.457565;0.457617;0.457758;0.457746 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 11648;7168;1024;1024;1024 |
380 | InceptionV4/InceptionV4/Mixed_7b/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 21.333 | 114688 | 0 | 171163648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 2517.33 | 45.50 | 4.63 | 4.10 | true | 0.448666;0.454419;0.456147;0.455878;0.453578 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 13312;3072;1408;1536;2944 |
382 | InceptionV4/InceptionV4/Mixed_7b/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1536 8 8]] | 24.333 | 393216 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 597.33 | 33578.67 | 52.40 | 0.00 | 0.00 | true | 0.523750;0.524003;0.523501;0.523877;0.523968 | 0;0;0;0;0 | 0;1792;0;0;5632 | 32640;35456;32640;32640;36608 |
383 | InceptionV4/InceptionV4/Mixed_7c/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1536 8 8]] | 41.333 | 557056 | 557056 | 171556864 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 1645026 | 0.00 | 166645.33 | 36.20 | 9.87 | 235.00 | true | 0.361371;0.361812;0.364225;0.361985;0.361567 | 1645026;1645026;1645026;1645026;1645026 | 0;0;0;0;0 | 143488;163584;170848;176512;165504 |
384 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 171 | 98304 | 2457600 | 171655168 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 77.00 | 75522048 | 2688.00 | 235904.00 | 3.10 | 316.53 | 980.81 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 2688;2816;2560;2816;2560 | 235360;235552;237792;234752;236800 |
384 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 171 | 98304 | 2457600 | 171655168 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.33 | 0 | 2359360.00 | 975797.33 | 43.60 | 0.00 | 0.00 | true | 0.434796;0.437928;0.434572;0.436520;0.436725 | 0;0;0;0;0 | 2359360;2359360;2359360;2359360;2359360 | 976288;974496;976608;980960;971840 |
385 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 173.333 | 98304 | 2457600 | 171753472 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 75522048 | 0.00 | 60330.67 | 3.10 | 1251.80 | 993.71 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 0;0;0;0;0 | 61408;60928;59200;60864;58080 |
385 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 173.333 | 98304 | 2457600 | 171753472 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.67 | 0 | 2359296.00 | 223584.00 | 43.50 | 0.00 | 0.00 | true | 0.434812;0.434828;0.435596;0.435695;0.435460 | 0;0;0;0;0 | 2359296;2359296;2359296;2359296;2359296 | 222784;222912;225056;221856;229856 |
386 | InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 170.667 | 65536 | 1638400 | 171819008 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 50348032 | 0.00 | 1578.67 | 3.10 | 31892.75 | 662.47 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;0;0;0;0 | 1536;1536;1920;1536;1664 |
386 | InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 170.667 | 65536 | 1638400 | 171819008 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1572864.00 | 54634.67 | 43.10 | 0.00 | 0.00 | true | 0.432357;0.432071;0.429871;0.431182;0.428651 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 54400;54432;55072;53440;55936 |
387 | InceptionV4/InceptionV4/Mixed_7c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 189.333 | 65536 | 1638400 | 171491328 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 100.67 | 50348032 | 393216.00 | 283733.33 | 3.10 | 74.37 | 500.14 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 393216;393216;393216;393216;393216 | 284288;280064;286080;289664;280832 |
387 | InceptionV4/InceptionV4/Mixed_7c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 189.333 | 65536 | 1638400 | 171491328 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1572864.00 | 177706.67 | 43.30 | 0.00 | 0.00 | true | 0.431015;0.434757;0.433302;0.434019;0.431896 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 177408;181376;175360;172160;180352 |
388 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 29 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 24576 | 100096.00 | 80170.67 | 46.40 | 0.14 | 5.27 | true | 0.462144;0.464883;0.464628;0.464399;0.463797 | 24576;24576;24576;24576;24576 | 80384;80128;80000;80512;79360 | 100096;100096;100096;100096;100096 |
389 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 21.667 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 1621.33 | 45.40 | 7.78 | 6.14 | true | 0.454641;0.453812;0.452961;0.455325;0.454638 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;1536 | 0;4864;0;0;7680 |
390 | InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 23.333 | 65536 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 256.00 | 45.40 | 12.80 | 4.10 | true | 0.454143;0.455730;0.454120;0.455001;0.454020 | 16384;16384;16384;16384;16384 | 256;256;256;256;256 | 1024;1024;1024;1024;1024 |
391 | InceptionV4/InceptionV4/Mixed_7c/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 22 | 65536 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 1024.00 | 45.60 | 8.00 | 4.10 | true | 0.452187;0.455289;0.455830;0.456354;0.456274 | 16384;16384;16384;16384;16384 | 6656;1024;1024;1024;1024 | 1024;1024;1024;1024;1024 |
392 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 20.333 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.00 | 0 | 0.00 | 0.00 | 44.20 | 0.00 | 0.00 | true | 0.441551;0.442029;0.442089;0.441751;0.441596 | 0;0;0;0;0 | 17920;0;0;0;0 | 0;0;0;0;0 |
393 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 21.333 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 0.00 | 43.80 | 0.00 | 0.00 | true | 0.438123;0.438643;0.437858;0.438546;0.438134 | 0;0;0;0;0 | 0;0;0;0;0 | 0;0;0;0;0 |
394 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 152.333 | 114688 | 2179072 | 171048960 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 66088960 | 0.00 | 1984.00 | 3.10 | 33310.97 | 1120.15 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 66088960;66088960;66088960;66088960;66088960 | 0;0;0;0;0 | 2688;1952;2080;1696;1920 |
394 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 152.333 | 114688 | 2179072 | 171048960 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 14.00 | 0 | 2064448.00 | 266613.33 | 42.70 | 0.00 | 0.00 | true | 0.427709;0.429720;0.425474;0.425641;0.428330 | 0;0;0;0;0 | 2064448;2069568;2064448;2064448;2064448 | 248896;268576;269088;269312;262176 |
395 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 147.333 | 65536 | 1245184 | 171016192 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 62.00 | 37765120 | 20053.33 | 128.00 | 3.10 | 1871.29 | 609.11 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 128;128;128;640;128 | 20608;19200;19840;31360;19712 |
395 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 147.333 | 65536 | 1245184 | 171016192 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 119925.33 | 43.50 | 0.00 | 0.00 | true | 0.434334;0.434826;0.431482;0.435019;0.436027 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179648 | 119968;119456;119872;120000;119936 |
396 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 143 | 65536 | 1245184 | 171081728 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 37765120 | 64.00 | 149.33 | 3.10 | 177024.28 | 640.09 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 192;192;64;64;192 | 64;64;5184;64;64 |
396 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 143 | 65536 | 1245184 | 171081728 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 59520.00 | 43.40 | 0.00 | 0.00 | true | 0.433679;0.435479;0.433639;0.433314;0.433637 | 0;0;0;0;0 | 1179648;1181696;1179648;1179648;1179648 | 59520;59776;59520;59264;59520 |
397 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 448 8 8]] | 30.667 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.67 | 28672 | 2048.00 | 0.00 | 46.40 | 14.00 | 6.14 | true | 0.464134;0.463542;0.463732;0.462336;0.464253 | 28672;28672;28672;28672;28672 | 2048;2048;12032;2048;2048 | 0;0;0;0;0 |
398 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 22 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1194.67 | 64.00 | 45.50 | 13.02 | 4.10 | true | 0.453257;0.454926;0.454706;0.455597;0.456006 | 16384;16384;16384;16384;16384 | 7680;1024;1536;1024;1024 | 64;192;64;64;64 |
399 | InceptionV4/InceptionV4/Mixed_7c/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 22.667 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 768.00 | 45.50 | 9.14 | 4.10 | true | 0.454751;0.455981;0.454521;0.455823;0.455079 | 16384;16384;16384;16384;16384 | 768;768;768;768;768 | 1024;1024;1024;3072;1024 |
400 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0b_3x1/Relu | Relu | [[1 448 8 8]] | 21.333 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 42.67 | 44.10 | 0.00 | 0.00 | true | 0.441288;0.441407;0.441488;0.441771;0.441135 | 0;0;0;0;0 | 0;0;0;0;0 | 128;0;0;0;128 |
401 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 164 | 131072 | 2883584 | 171114496 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.67 | 88113152 | 682.67 | 80085.33 | 3.10 | 1090.94 | 1302.16 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 88113152;88113152;88113152;88113152;88113152 | 768;640;512;896;640 | 80640;81536;75520;81920;78080 |
401 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 164 | 131072 | 2883584 | 171114496 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 17.00 | 0 | 2752576.00 | 1148736.00 | 43.40 | 0.00 | 0.00 | true | 0.433196;0.435209;0.435078;0.432457;0.431156 | 0;0;0;0;0 | 2752512;2752576;2752640;2752576;2752576 | 1142720;1144000;1159648;1152640;1149568 |
402 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 512 8 8]] | 31 | 131072 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 32768 | 2048.00 | 68778.67 | 45.70 | 0.46 | 8.19 | true | 0.453719;0.456568;0.456998;0.457366;0.457396 | 32768;32768;32768;32768;32768 | 2048;2048;2048;8704;2048 | 69504;66176;69248;69504;67584 |
403 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0c_1x3/Relu | Relu | [[1 512 8 8]] | 22 | 131072 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 256.00 | 44.10 | 0.00 | 0.00 | true | 0.441052;0.440823;0.440935;0.440903;0.441013 | 0;0;0;0;0 | 0;0;0;0;0 | 384;128;384;256;128 |
404 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 167.667 | 114688 | 1687552 | 171114496 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 50348032 | 0.00 | 82272.00 | 3.10 | 611.97 | 662.47 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;0;0;0;0 | 81728;81440;84640;81824;83264 |
404 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 167.667 | 114688 | 1687552 | 171114496 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1572864.00 | 270058.67 | 43.60 | 0.00 | 0.00 | true | 0.434539;0.437954;0.436075;0.437053;0.432372 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 269952;270496;272416;265184;269728 |
405 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 162.333 | 65536 | 1638400 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.33 | 50348032 | 0.00 | 17450.67 | 3.10 | 2885.16 | 659.58 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 17664;17408;16768;17408;17536 | 0;0;0;0;0 |
405 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 162.333 | 65536 | 1638400 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1572864.00 | 127701.33 | 43.40 | 0.00 | 0.00 | true | 0.431107;0.438743;0.434806;0.433216;0.432885 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 129344;127104;125440;128000;128000 |
406 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 27.667 | 114688 | 0 | 171048960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 1024.00 | 45.80 | 8.00 | 4.10 | true | 0.458056;0.458202;0.458078;0.457868;0.457982 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 1024;1024;1024;1024;1024 |
407 | InceptionV4/InceptionV4/Mixed_7c/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 23 | 65536 | 0 | 171048960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 2304.00 | 45.50 | 4.92 | 4.10 | true | 0.451978;0.454208;0.456362;0.455863;0.456319 | 16384;16384;16384;16384;16384 | 1024;1280;1024;1024;1024 | 0;6272;512;128;10368 |
409 | InceptionV4/InceptionV4/Mixed_7c/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1536 8 8]] | 25.333 | 557056 | 0 | 171163648 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 154880.00 | 52.40 | 0.00 | 0.00 | true | 0.524810;0.524458;0.523716;0.523538;0.523901 | 0;0;0;0;0 | 0;0;768;0;0 | 154880;150272;155264;154752;155008 |
410 | InceptionV4/InceptionV4/Mixed_7d/Branch_3/AvgPool_0a_3x3/AvgPool | AvgPool | [[1 1536 8 8]] | 41.333 | 393216 | 393216 | 171556864 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.00 | 1466766 | 0.00 | 198133.33 | 36.30 | 7.40 | 209.54 | true | 0.363413;0.359538;0.362822;0.362883;0.362116 | 1466766;1466766;1466766;1466766;1466766 | 0;0;0;0;256 | 195808;194784;205280;196128;202464 |
411 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 172.667 | 98304 | 2457600 | 171655168 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 75522048 | 0.00 | 176586.67 | 3.10 | 427.68 | 993.71 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 177824;178976;175904;171968;176032 | 0;0;0;0;256 |
411 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 172.667 | 98304 | 2457600 | 171655168 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.00 | 0 | 2359360.00 | 713429.33 | 43.50 | 0.00 | 0.00 | true | 0.436368;0.434730;0.436328;0.434717;0.435013 | 0;0;0;0;0 | 710208;709184;711712;720096;718368 | 2359360;2359360;2359360;2359360;2361152 |
412 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 170 | 98304 | 2457600 | 171753472 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 75522048 | 1024.00 | 72586.67 | 3.10 | 1025.97 | 993.71 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 75522048;75522048;75522048;75522048;75522048 | 1024;1024;1024;1024;1024 | 72800;70496;73184;71904;73056 |
412 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 384 8 8]] | 170 | 98304 | 2457600 | 171753472 | GPU_0_bfc | 2359296 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 15.33 | 0 | 2359296.00 | 455733.33 | 43.30 | 0.00 | 0.00 | true | 0.435176;0.433916;0.429956;0.432846;0.432666 | 0;0;0;0;0 | 2359296;2359296;2359296;2359296;2359296 | 455168;461408;453504;457056;454976 |
413 | InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 166 | 65536 | 1638400 | 171819008 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 50348032 | 0.00 | 2304.00 | 3.10 | 21852.44 | 662.47 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;0;0;0;0 | 2560;2048;2304;2048;2560 |
413 | InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 166 | 65536 | 1638400 | 171819008 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1573546.67 | 50304.00 | 43.00 | 0.00 | 0.00 | true | 0.426458;0.430396;0.428988;0.429430;0.432061 | 0;0;0;0;0 | 1572864;1572864;1572864;1577984;1574912 | 51712;49664;50304;50944;49568 |
414 | InceptionV4/InceptionV4/Mixed_7d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 186 | 65536 | 1638400 | 171327488 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 101.00 | 50348032 | 393216.00 | 302389.33 | 3.10 | 72.38 | 498.50 | false | 0.031250;0.031249;0.031249;0.031249;0.031250 | 50348032;50348032;50348032;50348032;50348032 | 393216;393216;393216;395264;393216 | 309760;301312;300704;305152;300672 |
414 | InceptionV4/InceptionV4/Mixed_7d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 186 | 65536 | 1638400 | 171327488 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1572864.00 | 173066.67 | 43.70 | 0.00 | 0.00 | true | 0.438480;0.438828;0.435513;0.434498;0.438336 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 171520;170752;176000;174496;173184 |
415 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 28.667 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 100096.00 | 94666.67 | 46.40 | 0.13 | 6.14 | true | 0.462024;0.464766;0.464375;0.464631;0.462994 | 24576;24576;24576;24576;24576 | 100096;100096;100096;100096;100096 | 89280;96128;96000;91904;96096 |
416 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 384 8 8]] | 22.333 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 24576 | 1536.00 | 768.00 | 45.40 | 10.67 | 6.14 | true | 0.454239;0.453536;0.451336;0.454279;0.454822 | 24576;24576;24576;24576;24576 | 1536;1536;1536;1536;1536 | 0;5120;0;2304;0 |
417 | InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 22.333 | 65536 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 1280.00 | 45.50 | 7.11 | 4.10 | true | 0.453287;0.456090;0.452389;0.456249;0.455899 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 1280;1280;1280;1280;1408 |
418 | InceptionV4/InceptionV4/Mixed_7d/Branch_3/Conv2d_0b_1x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 20.667 | 65536 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 0.00 | 45.50 | 16.00 | 4.10 | true | 0.454649;0.455155;0.454675;0.455961;0.455858 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 0;0;4864;0;0 |
419 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 21 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 2474.67 | 44.20 | 0.00 | 0.00 | true | 0.441997;0.442256;0.441760;0.442037;0.441780 | 0;0;0;0;0 | 1024;0;0;0;0 | 7040;384;15488;0;0 |
420 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0a_1x1/Relu | Relu | [[1 384 8 8]] | 19.333 | 98304 | 0 | 170934272 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 1024.00 | 1834.67 | 43.80 | 0.00 | 0.00 | true | 0.437839;0.439053;0.437842;0.438358;0.437971 | 0;0;0;0;0 | 0;3072;0;0;6656 | 2560;1408;1536;6400;0 |
421 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 152.333 | 114688 | 2179072 | 171048960 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 66088960 | 0.00 | 6090.67 | 3.10 | 10850.86 | 1126.51 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 66088960;66088960;66088960;66088960;66088960 | 6048;6048;6176;6272;5408 | 0;0;0;0;0 |
421 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 448 8 8]] | 152.333 | 114688 | 2179072 | 171048960 | GPU_0_bfc | 2064384 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 14.00 | 0 | 2064448.00 | 158570.67 | 42.80 | 0.00 | 0.00 | true | 0.427073;0.430735;0.428577;0.428518;0.426176 | 0;0;0;0;0 | 2064448;2064448;2064448;2064448;2064448 | 155680;162528;138656;157504;165088 |
422 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 147.333 | 65536 | 1245184 | 171016192 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 59.00 | 37765120 | 1578.67 | 298.67 | 3.10 | 20116.36 | 640.09 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 1536;1664;1536;1664;1536 | 256;256;384;6656;256 |
422 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 147.333 | 65536 | 1245184 | 171016192 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 10.00 | 0 | 1179648.00 | 88917.33 | 43.10 | 0.00 | 0.00 | true | 0.431042;0.431094;0.430230;0.433139;0.431654 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179904 | 88960;88832;89088;88960;84352 |
423 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 144 | 65536 | 1245184 | 171081728 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 58.67 | 37765120 | 0.00 | 256.00 | 3.10 | 147520.00 | 643.72 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 37765120;37765120;37765120;37765120;37765120 | 0;0;0;0;0 | 256;256;256;256;256 |
423 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 144 | 65536 | 1245184 | 171081728 | GPU_0_bfc | 1179648 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 9.67 | 0 | 1179648.00 | 74560.00 | 43.50 | 0.00 | 0.00 | true | 0.435244;0.436126;0.434293;0.433559;0.435142 | 0;0;0;0;0 | 1179648;1179648;1179648;1179648;1179648 | 74720;74688;74624;70144;74368 |
424 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 448 8 8]] | 28.667 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 28672 | 2048.00 | 256.00 | 46.30 | 12.44 | 7.17 | true | 0.463546;0.463282;0.463209;0.463413;0.462993 | 28672;28672;28672;28672;28672 | 2048;2048;2048;2048;2304 | 256;256;256;256;256 |
425 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0c_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 24 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 0.00 | 45.50 | 16.00 | 4.10 | true | 0.455715;0.456061;0.454967;0.455741;0.453674 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 0;0;128;0;0 |
426 | InceptionV4/InceptionV4/Mixed_7d/Branch_1/Conv2d_0b_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 21.667 | 65536 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 85.33 | 45.50 | 14.77 | 4.10 | true | 0.455784;0.454355;0.454523;0.456881;0.454078 | 16384;16384;16384;16384;16384 | 128;0;128;0;128 | 1024;1024;1024;1024;3328 |
427 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0b_3x1/Relu | Relu | [[1 448 8 8]] | 21.667 | 114688 | 0 | 170983424 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.33 | 0 | 0.00 | 768.00 | 44.20 | 0.00 | 0.00 | true | 0.440970;0.441418;0.441365;0.441773;0.441855 | 0;0;0;0;0 | 0;0;0;0;0 | 768;768;768;768;768 |
428 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 165 | 131072 | 2883584 | 171114496 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 67.67 | 88113152 | 0.00 | 36608.00 | 3.10 | 2406.94 | 1302.16 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 88113152;88113152;88113152;88113152;88113152 | 0;0;0;0;2816 | 35776;35744;38304;32288;40128 |
428 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 512 8 8]] | 165 | 131072 | 2883584 | 171114496 | GPU_0_bfc | 2752512 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 17.33 | 0 | 2752512.00 | 1096949.33 | 43.40 | 0.00 | 0.00 | true | 0.435140;0.434259;0.434638;0.433125;0.433507 | 0;0;0;0;0 | 2752512;2752512;2752512;2752512;2752512 | 1100576;1094528;1095744;1085088;1101376 |
429 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 512 8 8]] | 28.333 | 131072 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 32768 | 2048.00 | 113216.00 | 45.70 | 0.28 | 8.19 | true | 0.456875;0.457516;0.455109;0.457947;0.457789 | 32768;32768;32768;32768;32768 | 2048;2048;2048;2048;2048 | 110624;115840;111872;118848;111936 |
430 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0c_1x3/Relu | Relu | [[1 512 8 8]] | 21 | 131072 | 0 | 170999808 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 4.00 | 0 | 0.00 | 810.67 | 44.10 | 0.00 | 0.00 | true | 0.441523;0.441420;0.441157;0.441313;0.441419 | 0;0;0;0;0 | 0;0;2304;0;0 | 768;896;768;896;768 |
431 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 165.333 | 114688 | 1687552 | 171114496 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 50348032 | 0.00 | 41493.33 | 3.10 | 1213.40 | 662.47 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;0;0;0;0 | 42944;41792;41152;40864;41536 |
431 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 165.333 | 114688 | 1687552 | 171114496 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.00 | 0 | 1572864.00 | 258464.00 | 43.60 | 0.00 | 0.00 | true | 0.436298;0.435769;0.435063;0.434929;0.436345 | 0;0;0;0;0 | 1577984;1572864;1572864;1572864;1572864 | 255296;254912;260448;259840;260256 |
432 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 161.333 | 65536 | 1638400 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void cudnn::detail::implicit_convolve_sgemm<float, float, 1024, 5, 5, 3, 3, 3, 1, true, false, true>(int, int, int, float const*, int, float*, float*, kernel_conv_params, int, float, float, int, float*, float*, int, int) | 76.00 | 50348032 | 0.00 | 63957.33 | 3.10 | 787.21 | 662.47 | false | 0.031249;0.031249;0.031249;0.031249;0.031249 | 50348032;50348032;50348032;50348032;50348032 | 0;5120;0;0;0 | 64000;63872;63744;64000;64128 |
432 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/mul | Conv2D | [[1 256 8 8]] | 161.333 | 65536 | 1638400 | 171180032 | GPU_0_bfc | 1572864 | 0 | 0 | 0 | void tensorflow::functor::ShuffleInTensor3Simple<float, 2, 1, 0, false>(int, float const*, tensorflow::functor::Dimension<3>, float*) | 11.33 | 0 | 1572864.00 | 150101.33 | 43.50 | 0.00 | 0.00 | true | 0.434581;0.434972;0.432186;0.434563;0.434664 | 0;0;0;0;0 | 1572864;1572864;1572864;1572864;1572864 | 149632;149760;150656;150144;150400 |
433 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0e_3x1/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 27.667 | 114688 | 0 | 171048960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1024.00 | 1280.00 | 45.80 | 7.11 | 4.10 | true | 0.457628;0.457650;0.458241;0.457960;0.457491 | 16384;16384;16384;16384;16384 | 1024;1024;1024;1024;1024 | 1280;6784;1280;1280;1280 |
434 | InceptionV4/InceptionV4/Mixed_7d/Branch_2/Conv2d_0d_1x3/BatchNorm/batchnorm/add_1 | Add | [[1 256 8 8]] | 21.667 | 65536 | 0 | 171048960 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 2, 1, int>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_sum_op<float, float>, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const, Eigen::TensorBroadcastingOp<Eigen::array<long, 2ul> const, Eigen::TensorMap<Eigen::Tensor<float const, 2, 1, int>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, int) | 4.00 | 16384 | 1706.67 | 1066.67 | 45.50 | 5.91 | 4.10 | true | 0.455054;0.454306;0.456445;0.455357;0.455553 | 16384;16384;16384;16384;16384 | 3072;1024;3072;1024;1024 | 1152;1024;1152;1024;1024 |
436 | InceptionV4/InceptionV4/Mixed_7d/Branch_0/Conv2d_0a_1x1/Relu | Relu | [[1 1536 8 8]] | 24.667 | 638976 | 0 | 171245568 | GPU_0_bfc | 0 | 0 | 0 | 0 | void Eigen::internal::EigenMetaKernel<Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long>(Eigen::TensorEvaluator<Eigen::TensorAssignOp<Eigen::TensorMap<Eigen::Tensor<float, 1, 1, long>, 16, Eigen::MakePointer>, Eigen::TensorCwiseBinaryOp<Eigen::internal::scalar_max_op<float const, float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const, Eigen::TensorCwiseNullaryOp<Eigen::internal::scalar_constant_op<float const>, Eigen::TensorMap<Eigen::Tensor<float const, 1, 1, long>, 16, Eigen::MakePointer> const> const> const> const, Eigen::GpuDevice>, long) | 3.67 | 0 | 0.00 | 88618.67 | 52.40 | 0.00 | 0.00 | true | 0.523750;0.524153;0.523780;0.523962;0.523129 | 0;0;0;0;0 | 0;0;0;0;0 | 88096;88768;88288;93088;88800 |
437 | InceptionV4/Logits/AvgPool_1a/AvgPool | AvgPool | [[1 1536 1 1]] | 44.333 | 6144 | 6144 | 171251712 | GPU_0_bfc | 0 | 0 | 0 | 0 | void cudnn::detail::pooling_fw_4d_kernel<float, float, cudnn::detail::averpooling_func<float>, 2, false>(cudnnTensorStruct, float const*, cudnnTensorStruct, float*, cudnnPoolingStruct, float, float, int, cudnn::reduced_divisor, cudnn::reduced_divisor) | 7.67 | 131847 | 2560.00 | 6229.33 | 11.80 | 15.00 | 17.20 | true | 0.118538;0.118411;0.118429;0.118426;0.118415 | 131847;131847;131847;131847;131847 | 2560;2560;2560;2560;2560 | 6272;6144;6272;6144;6272 |
443 | InceptionV4/Logits/PreLogitsFlatten/Prod | Prod | [[]] | 44.333 | 256 | 256 | 170613248 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::functor::BlockReduceKernel<int*, int*, 256, tensorflow::functor::Prod<int> >(int*, int*, int, tensorflow::functor::Prod<int>, std::iterator_traits<int*>::value_type) | 3.67 | 0 | 3072.00 | 213.33 | 12.10 | 0.00 | 0.00 | true | 0.121399;0.121395;0.121397;0.121395;0.121392 | 0;0;0;0;0 | 128;256;4480;256;128 | 3072;3072;3072;3072;3072 |
447 | InceptionV4/Logits/Logits/MatMul | MatMul | [[1 1001]] | 76.333 | 4096 | 4096 | 170616832 | GPU_0_bfc | 0 | 0 | 0 | 0 | void gemv2N_kernel<int, int, float, float, float, 128, 8, 4, 4, 1, cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float> >(cublasGemvParams<cublasGemvTensorStridedBatched<float const>, cublasGemvTensorStridedBatched<float>, float>) | 20.67 | 3202871 | 6157376.00 | 2072586.67 | 6.20 | 0.39 | 154.98 | true | 0.062490;0.062490;0.062491;0.062490;0.062491 | 3202871;3202871;3202871;3202871;3202871 | 2065504;2095904;2048288;2056352;2116896 | 6157376;6157376;6167872;6157376;6157376 |
448 | InceptionV4/Logits/Logits/BiasAdd | BiasAdd | [[1 1001]] | 28.333 | 4096 | 0 | 170610688 | GPU_0_bfc | 0 | 0 | 0 | 0 | void tensorflow::BiasNHWCKernel<float>(int, float const*, float const*, float*, int) | 3.67 | 1001 | 5600.00 | 213.33 | 47.20 | 0.17 | 0.27 | true | 0.471649;0.473316;0.473520;0.471218;0.470624 | 1001;1001;1001;1001;1001 | 1920;0;640;0;0 | 6368;5600;5600;5600;5600 |
449 | InceptionV4/Logits/Predictions | Softmax | [[1 1001]] | 63.667 | 4096 | 10240 | 170610688 | GPU_0_bfc | 10240 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, cub::Sum>(cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long>, float*, int, int, cub::Sum, std::iterator_traits<cub::TransformInputIterator<float, tensorflow::(anonymous namespace)::SubtractAndExpFunctor<float, float>, cub::CountingInputIterator<int, long>, long> >::value_type) | 8.33 | 10431 | 6656.00 | 42.67 | 2.40 | 1.56 | 1.25 | true | 0.023928;0.023915;0.023912;0.024434;0.023796 | 10431;10431;10431;10431;10431 | 0;128;128;0;0 | 6656;6656;6656;6656;6656 |
449 | InceptionV4/Logits/Predictions | Softmax | [[1 1001]] | 63.667 | 4096 | 10240 | 170610688 | GPU_0_bfc | 10240 | 0 | 0 | 0 | void tensorflow::functor::RowReduceKernel<float const*, float*, cub::Max>(float const*, float*, int, int, cub::Max, std::iterator_traits<float const*>::value_type) | 5.00 | 0 | 4096.00 | 85.33 | 3.90 | 0.00 | 0.00 | true | 0.038345;0.039822;0.039747;0.039779;0.038280 | 0;0;0;0;0 | 4096;4096;4096;4096;4864 | 0;0;256;0;2304 |
449 | InceptionV4/Logits/Predictions | Softmax | [[1 1001]] | 63.667 | 4096 | 10240 | 170610688 | GPU_0_bfc | 10240 | 0 | 0 | 0 | void tensorflow::(anonymous namespace)::GenerateNormalizedProb<float, float>(float const*, float const*, float const*, float*, int, int, bool) | 4.00 | 24024 | 2368.00 | 2432.00 | 6.20 | 5.00 | 6.01 | true | 0.062277;0.062249;0.062248;0.062246;0.062261 | 24024;24024;24024;24024;24024 | 2368;2368;2368;2368;2368 | 2432;2432;2688;2432;2432 |
Showing 1 to 708 of 708 entries