AliceO2Group · davidrohr · Jun 3, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026
@@ -15,6 +15,7 @@
 #ifndef ALICEO2_DATAFORMATSTPC_CLUSTERNATIVE_H
 #define ALICEO2_DATAFORMATSTPC_CLUSTERNATIVE_H
 #ifndef GPUCA_GPUCODE_DEVICE
+#include <climits>
 #include <cstdint>
 #include <cstddef> // for size_t
 #include <utility>
@@ -62,6 +63,8 @@ struct ClusterNative {
   static constexpr int scalePadPacked = 64;       //< ~60 is needed for 0.1mm precision, but power of two avoids rounding
   static constexpr int scaleSigmaTimePacked = 32; // 1/32nd of pad/timebin precision for cluster size
   static constexpr int scaleSigmaPadPacked = 32;
+  static constexpr int scaleSaturatedQTot = 4;
+  static constexpr int maxSaturatedQTot = USHRT_MAX * scaleSaturatedQTot;
 
   uint32_t timeFlagsPacked; //< Contains the time in the lower 24 bits in a packed format, contains the flags in the
                             // upper 8 bits
@@ -83,7 +86,15 @@ struct ClusterNative {
   }
 
   GPUd() uint16_t getQmax() const { return qMax; }
-  GPUd() uint16_t getQtot() const { return qTot; }
+  GPUd() uint16_t getQtot() const
+  {
+    if (isSaturated()) [[unlikely]] {
+      // Check for overflow, so return type can stay uint16
+      auto sqtot = getSaturatedQtot();
+      return sqtot <= USHRT_MAX ? sqtot : USHRT_MAX;
+    }
+    return qTot;
+  }
   GPUd() uint8_t getFlags() const { return timeFlagsPacked >> 24; }
   GPUd() uint32_t getTimePacked() const { return timeFlagsPacked & 0xFFFFFF; }
   GPUd() void setTimePackedFlags(uint32_t timePacked, uint8_t flags)
@@ -119,7 +130,13 @@ struct ClusterNative {
   /// Y = (12.4 - 0.5 * (66 - 1)) * 4.16mm = -83.616mm
   GPUd() float getPad() const { return unpackPad(padPacked); }
   GPUd() void setPad(float pad) { padPacked = packPad(pad); }
-  GPUd() float getSigmaTime() const { return float(sigmaTimePacked) * (1.f / scaleSigmaTimePacked); }
+  GPUd() float getSigmaTime() const
+  {
+    if (isSaturated()) [[unlikely]] {
+      return 0;
+    }
+    return float(sigmaTimePacked) * (1.f / scaleSigmaTimePacked);
+  }
   GPUd() void setSigmaTime(float sigmaTime)
   {
     uint32_t tmp = sigmaTime * scaleSigmaTimePacked + 0.5;
@@ -138,6 +155,31 @@ struct ClusterNative {
     sigmaPadPacked = tmp;
   }
 
+  GPUd() bool isSaturated() const { return qMax >= 1023; }
+
+  GPUd() void setSaturatedQtot(uint32_t qtot)
+  {
+    if (qtot > maxSaturatedQTot) {
+      qtot = maxSaturatedQTot;
+    }
+    this->qTot = (qtot + scaleSaturatedQTot / 2) / scaleSaturatedQTot;
+  }
+
+  GPUd() uint32_t getSaturatedQtot() const
+  {
+    return uint32_t(qTot) * scaleSaturatedQTot;
+  }
+
+  GPUd() void setSaturatedTailLength(uint32_t tail)
+  {
+    sigmaTimePacked = encodeTailLength(tail);
+  }
+
+  GPUd() uint32_t getSaturatedTailLength() const
+  {
+    return decodeTailLength(sigmaTimePacked);
+  }
+
   GPUd() bool operator<(const ClusterNative& rhs) const
   {
     if (this->getTimePacked() != rhs.getTimePacked()) {
@@ -167,6 +209,93 @@ struct ClusterNative {
            this->qTot == rhs.qTot &&
            this->getFlags() == rhs.getFlags();
   }
+
+ private:
+  static constexpr GPUd() uint32_t decodeTailLength(uint8_t code)
+  {
+    // Quantize tail length into 8bits.
+    // Max expected length is 1500 tbs.
+    // But allow outliers up to 8000 tbs.
+    //
+    // Full code layout is:
+    //
+    // | Code range | Decoded values |  Step | Codes |
+    // | ---------: | -------------: | ----: | ----: |
+    // |    `0..63` |        `0..63` |   `1` |  `64` |
+    // |   `64..95` |      `64..126` |   `2` |  `32` |
+    // |  `96..127` |     `128..252` |   `4` |  `32` |
+    // | `128..159` |     `256..504` |   `8` |  `32` |
+    // | `160..223` |    `512..1520` |  `16` |  `64` |
+    // | `224..239` |   `1552..2032` |  `32` |  `16` |
+    // | `240..255` |   `2048..8048` | `400` |  `16` |
+    //
+
+    if (code < 64) {
+      return code;
+    }
+
+    if (code < 160) {
+      uint32_t q = (uint32_t)code - 64u;
+      uint32_t exponent = (q >> 5) + 1u; // 1, 2, 3
+      uint32_t mantissa = q & 31u;       // 0..31
+
+      return (32u + mantissa) << exponent;
+    }
+
+    if (code < 224) {
+      return 512u + 16u * ((uint32_t)code - 160u);
+    }
+
+    if (code < 240) {
+      return 1552u + 32u * ((uint32_t)code - 224u);
+    }
+
+    return 2048u + 400u * ((uint32_t)code - 240u);
+  }
+
+  static constexpr GPUd() uint8_t encodeTailLength(uint32_t value)
+  {
+    // Saturate above representable range.
+    if (value >= decodeTailLength(255)) [[unlikely]] {
+      return 255;
+    }
+
+    // Binary search for the first code whose decoded value >= value.
+    uint8_t lo = 0;
+    uint8_t hi = 255;
+
+    while (lo < hi) {
+      uint8_t mid = lo + ((hi - lo) >> 1);
+      uint32_t decoded = decodeTailLength(mid);
+
+      if (decoded < value) {
+        lo = mid + 1;
+      } else {
+        hi = mid;
+      }
+    }
+
+    // lo is now the first code with decoded >= value.
+    if (lo == 0) [[unlikely]] {
+      return 0;
+    }
+
+    uint8_t above_code = lo;
+    uint8_t below_code = lo - 1;
+
+    uint32_t above_value = decodeTailLength(above_code);
+    uint32_t below_value = decodeTailLength(below_code);
+
+    uint32_t above_error = above_value - value;
+    uint32_t below_error = value - below_value;
+
+    // Tie-break downward.
+    if (below_error <= above_error) {
+      return below_code;
+    } else {
+      return above_code;
+    }
+  }
 };
 
 // This is an index struct to access TPC clusters inside sectors and rows. It shall not own the data, but just point to

@@ -128,9 +128,11 @@ void GPUTPCClusterStatistics::RunStatistics(const o2::tpc::ClusterNativeAccess*
           tmpClusters[k] = clustersNative->clusters[i][j][k];
           if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
             GPUTPCCompression::truncateSignificantBitsChargeMax(tmpClusters[k].qMax, param);
-            GPUTPCCompression::truncateSignificantBitsCharge(tmpClusters[k].qTot, param);
             GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaPadPacked, param);
-            GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaTimePacked, param);
+            if (!tmpClusters[k].isSaturated()) {
+              GPUTPCCompression::truncateSignificantBitsCharge(tmpClusters[k].qTot, param);
+              GPUTPCCompression::truncateSignificantBitsWidth(tmpClusters[k].sigmaTimePacked, param);
+            }
           }
         }
         std::sort(tmpClusters.begin(), tmpClusters.end());

@@ -47,7 +47,7 @@ class GPUTPCCompression : public GPUProcessor
 #endif
 
   static constexpr uint32_t P_MAX_QMAX = 1 << 10;
-  static constexpr uint32_t P_MAX_QTOT = 5 * 5 * P_MAX_QMAX;
+  static constexpr uint32_t P_MAX_QTOT = 1 << 16;
   static constexpr uint32_t P_MAX_TIME = 1 << 24;
   static constexpr uint32_t P_MAX_PAD = 1 << 16;
   static constexpr uint32_t P_MAX_SIGMA = 1 << 8;

@@ -121,9 +121,11 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step0at
       uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
       if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
         compressor.truncateSignificantBitsChargeMax(qmax, param);
-        compressor.truncateSignificantBitsCharge(qtot, param);
         compressor.truncateSignificantBitsWidth(sigmapad, param);
-        compressor.truncateSignificantBitsWidth(sigmatime, param);
+        if (!orgCl.isSaturated()) {
+          compressor.truncateSignificantBitsCharge(qtot, param);
+          compressor.truncateSignificantBitsWidth(sigmatime, param);
+        }
       }
       c.qTotA[cidx] = qtot;
       c.qMaxA[cidx] = qmax;
@@ -298,9 +300,11 @@ GPUdii() void GPUTPCCompressionKernels::Thread<GPUTPCCompressionKernels::step1un
         uint8_t sigmapad = orgCl.sigmaPadPacked, sigmatime = orgCl.sigmaTimePacked;
         if (param.rec.tpc.compressionTypeMask & GPUSettings::CompressionTruncate) {
           compressor.truncateSignificantBitsChargeMax(qmax, param);
-          compressor.truncateSignificantBitsCharge(qtot, param);
           compressor.truncateSignificantBitsWidth(sigmapad, param);
-          compressor.truncateSignificantBitsWidth(sigmatime, param);
+          if (!orgCl.isSaturated()) {
+            compressor.truncateSignificantBitsCharge(qtot, param);
+            compressor.truncateSignificantBitsWidth(sigmatime, param);
+          }
         }
         c.qTotU[outidx] = qtot;
         c.qMaxU[outidx] = qmax;

@@ -529,11 +529,12 @@ GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
     const float firstWeight = tail->qTot;
     const float firstPad = tail->pad;
     const float firstTime = HIPTailTimeMean(*tail);
-    const float firstTimeVariance = HIPTailTimeVariance(*tail);
     float padSum = firstWeight * firstPad;
     float padSqSum = firstWeight * firstPad * firstPad;
     float timeSum = firstWeight * firstTime;
-    float timeSqSum = firstWeight * (firstTime * firstTime + firstTimeVariance);
+
+    uint32_t tailStart = tail->tailStart;
+    uint32_t tailEnd = tail->tailEnd;
 
     while (tail->iNext != 0) {
 
@@ -542,28 +543,27 @@ GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
       const float tailWeight = tail->qTot;
       const float tailPad = tail->pad;
       const float tailTime = HIPTailTimeMean(*tail);
-      const float tailTimeVariance = HIPTailTimeVariance(*tail);
       qMax = CAMath::Max(qMax, tail->qMax);
       qTot += tail->qTot;
       padSum += tailWeight * tailPad;
       padSqSum += tailWeight * tailPad * tailPad;
       timeSum += tailWeight * tailTime;
-      timeSqSum += tailWeight * (tailTime * tailTime + tailTimeVariance);
+      tailStart = CAMath::Min<uint32_t>(tailStart, tail->tailStart);
+      tailEnd = CAMath::Max<uint32_t>(tailEnd, tail->tailEnd);
     }
 
     const float weightSum = CAMath::Max(qTot, 1.f);
     float padMean = padSum / weightSum;
     float timeMean = timeSum / weightSum; // TODO: Use timebin of saturated signal instead! Time mean is biased for long tails.
     float padSigma = CAMath::Sqrt(CAMath::Max(0.f, padSqSum / weightSum - padMean * padMean));
-    float timeSigma = CAMath::Sqrt(CAMath::Max(0.f, timeSqSum / weightSum - timeMean * timeMean));
 
     tpc::ClusterNative cn;
     cn.qMax = qMax;
-    cn.qTot = (uint16_t)CAMath::Min(qTot, 65535.f);
+    cn.setSaturatedQtot(qTot);
+    cn.setSaturatedTailLength(tailEnd - tailStart);
     float clusterTime = fragment.start + timeMean - clusterer.Param().rec.tpc.clustersShiftTimebinsClusterizer;
     cn.setTimeFlags(clusterTime, 0);
     cn.setPad(padMean);
-    cn.setSigmaTime(timeSigma);
     cn.setSigmaPad(padSigma);
 
     if (cn.qMax >= 1023) {

@@ -166,7 +166,13 @@ void GPUTPCClusterFinder::DumpClusters(std::ostream& out)
 
     out << "Row: " << i << ": " << N << "\n";
     for (const auto& cl : sortedCluster) {
-      out << std::hex << cl.timeFlagsPacked << std::dec << " " << cl.padPacked << " " << int32_t{cl.sigmaTimePacked} << " " << int32_t{cl.sigmaPadPacked} << " " << cl.qMax << " " << cl.qTot << "\n";
+      uint32_t qTot = cl.qTot;
+      uint32_t sigmaTime = cl.sigmaTimePacked;
+      if (cl.isSaturated()) {
+        qTot = cl.getSaturatedQtot();
+        sigmaTime = cl.getSaturatedTailLength();
+      }
+      out << std::hex << cl.timeFlagsPacked << std::dec << " " << cl.padPacked << " " << sigmaTime << " " << int32_t{cl.sigmaPadPacked} << " " << cl.qMax << " " << qTot << "\n";
     }
   }
 }