Skip to content

Dividing an xtensor container by a scalar is 20-40x slower than using std::transform #2849

@vakokako

Description

@vakokako

Dividing an xtensor container by an integer scalar (vOutput = vInput / 2) is far slower than a naive implementation with std::transform, whereas other operations such as multiplication (vOutput = vInput * 2) and maximum (vOutput = xt::maximum(vInput1, vInput2)) perform about as well as std::transform. We build with xsimd enabled.

Comparing speed of xtensor vs std::transform for different operations:

  • /2: xtensor is 20-40x slower
  • /2.0: xtensor is about 2x slower
  • *2: xtensor is about 10% slower
  • max: xtensor runs at the same speed

Benchmarks:

// Baseline for integer division: halves every element of a uint16_t tensor
// with std::transform, writing into a second tensor created before the timed loop.
// Both tensors take their shape from cContainerAssignShape (presumably 2000x2000,
// going by the benchmark name -- TODO confirm).
static void Xtensor_Uint16_2000x2000_DivideBy2_StdTransform(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    // Integer divisor: the quotient is an int that is converted back to
    // uint16_t when std::transform stores it.
    const auto vHalve = [](auto&& aElement) { return aElement / 2; };

    for (auto _ : aState) {
        std::transform(vSource.begin(), vSource.end(), vResult.begin(), vHalve);
    }
}

// xtensor counterpart of the integer-division baseline: builds the expression
// `source / 2` and assigns it to an output tensor created before the timed loop.
static void Xtensor_Uint16_2000x2000_DivideBy2_Xtensor(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    for (auto _ : aState) {
        // The divisor is an int literal while the tensors hold uint16_t.
        vResult = vSource / 2;
    }
}

// Baseline for floating-point division: divides every element of a uint16_t
// tensor by 2.0 with std::transform, writing into a second uint16_t tensor.
static void Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    // Double divisor: the lambda yields a double, which is converted to
    // uint16_t when std::transform stores it into the output.
    const auto vHalve = [](auto&& aElement) { return aElement / 2.0; };

    for (auto _ : aState) {
        std::transform(vSource.begin(), vSource.end(), vResult.begin(), vHalve);
    }
}

// xtensor counterpart of the floating-point-division baseline: builds the
// expression `source / 2.0` and assigns it to a uint16_t output tensor.
static void Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    for (auto _ : aState) {
        // The divisor is a double literal while the tensors hold uint16_t.
        vResult = vSource / 2.0;
    }
}

// Baseline for multiplication: doubles every element of a uint16_t tensor
// with std::transform, writing into a second tensor created before the timed loop.
static void Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    // Integer factor: the product is an int that is converted back to
    // uint16_t when std::transform stores it.
    const auto vDouble = [](auto&& aElement) { return aElement * 2; };

    for (auto _ : aState) {
        std::transform(vSource.begin(), vSource.end(), vResult.begin(), vDouble);
    }
}

// xtensor counterpart of the multiplication baseline: builds the expression
// `source * 2` and assigns it to an output tensor created before the timed loop.
static void Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vSource = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vSource);

    for (auto _ : aState) {
        // The factor is an int literal while the tensors hold uint16_t.
        vResult = vSource * 2;
    }
}

// Baseline for element-wise maximum: combines two uint16_t tensors with
// std::transform and std::max, writing into a third tensor created before
// the timed loop.
static void Xtensor_Uint16_2000x2000_Maximum_StdTransform(benchmark::State& aState) {
    auto vInput1 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vInput2 = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    auto vOutput = xt::xtensor<uint16_t, 2>::from_shape(cContainerAssignShape);
    generateRandomInt16From0To100(vInput1);
    generateRandomInt16From0To100(vInput2);

    for (auto _ : aState) {
        // Use the two-range overload of std::transform with a stateless lambda.
        // std::transform does not guarantee that the operation is applied in
        // order, so advancing a captured iterator inside a unary operation
        // would rely on unspecified behaviour.
        std::transform(vInput1.begin(), vInput1.end(), vInput2.begin(), vOutput.begin(),
                       [](const auto& aInput1Value, const auto& aInput2Value) { return std::max(aInput1Value, aInput2Value); });
    }
}

// xtensor counterpart of the element-wise maximum baseline: assigns
// xt::maximum of two uint16_t tensors to an output tensor created before
// the timed loop.
static void Xtensor_Uint16_2000x2000_Maximum_Xtensor(benchmark::State& aState) {
    using Tensor = xt::xtensor<uint16_t, 2>;

    Tensor vLeft = Tensor::from_shape(cContainerAssignShape);
    Tensor vRight = Tensor::from_shape(cContainerAssignShape);
    Tensor vResult = Tensor::from_shape(cContainerAssignShape);

    // Fill the operands in the same order as they were created.
    generateRandomInt16From0To100(vLeft);
    generateRandomInt16From0To100(vRight);

    for (auto _ : aState) {
        vResult = xt::maximum(vLeft, vRight);
    }
}

Results on ubuntu:

---------------------------------------------------------------------------------------------------------------
Benchmark                                                                     Time             CPU   Iterations
---------------------------------------------------------------------------------------------------------------
Xtensor_Uint16_2000x2000_DivideBy2_StdTransform                                       114483 ns       114483 ns         6016
Xtensor_Uint16_2000x2000_DivideBy2_Xtensor                                           4295418 ns      4295440 ns          165
Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform                                 442543 ns       442541 ns         1596
Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor                                      821435 ns       821429 ns          837
Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform                                     115849 ns       115845 ns         5901
Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor                                          131595 ns       131594 ns         5328
Xtensor_Uint16_2000x2000_Maximum_StdTransform                                         204465 ns       203952 ns         3156
Xtensor_Uint16_2000x2000_Maximum_Xtensor                                              198696 ns       198692 ns         3466

Results on windows:

---------------------------------------------------------------------------------------------------------------
Benchmark                                                                     Time             CPU   Iterations
---------------------------------------------------------------------------------------------------------------
Xtensor_Uint16_2000x2000_DivideBy2_StdTransform                          764377 ns       767299 ns         1120
Xtensor_Uint16_2000x2000_DivideBy2_Xtensor                             14637306 ns     14687500 ns           50
Xtensor_Uint16_2000x2000_DivideBy2Double_StdTransform                   2954759 ns      2966054 ns          289
Xtensor_Uint16_2000x2000_DivideBy2Double_Xtensor                        5484534 ns      5503462 ns          451
Xtensor_Uint16_2000x2000_MultiplyBy2_StdTransform                        759787 ns       767299 ns          896
Xtensor_Uint16_2000x2000_MultiplyBy2_Xtensor                             888914 ns       889369 ns          896
Xtensor_Uint16_2000x2000_Maximum_StdTransform                            993174 ns       976562 ns          640
Xtensor_Uint16_2000x2000_Maximum_Xtensor                                 985171 ns      1000977 ns          640

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions