ddtrace/tracer: update log msg to accurately count dropped traces #2845

Merged: 19 commits, Sep 24, 2024
Changes from 8 commits
1 change: 1 addition & 0 deletions ddtrace/tracer/metrics_test.go
@@ -61,6 +61,7 @@ func TestReportHealthMetrics(t *testing.T) {
     assert.Equal(int64(1), counts["datadog.tracer.spans_started"])
     assert.Equal(int64(1), counts["datadog.tracer.spans_finished"])
     assert.Equal(int64(0), counts["datadog.tracer.traces_dropped"])
+    assert.Equal(int64(0), counts["datadog.tracer.total_traces_dropped"])
 }

 func TestTracerMetrics(t *testing.T) {
6 changes: 6 additions & 0 deletions ddtrace/tracer/spancontext.go
@@ -266,6 +266,7 @@ type trace struct {
     priority *float64 // sampling priority
     locked bool // specifies if the sampling priority can be altered
     samplingDecision samplingDecision // samplingDecision indicates whether to send the trace to the agent.
+    dropped bool // specifies if part or all of the trace has been dropped

     // root specifies the root of the trace, if known; it is nil when a span
     // context is extracted from a carrier, at which point there are no spans in
@@ -483,6 +484,8 @@ func (t *trace) finishedOne(s *span) {
         t.finishChunk(tr, &chunk{
             spans: t.spans,
             willSend: decisionKeep == samplingDecision(atomic.LoadUint32((*uint32)(&t.samplingDecision))),
+            traceID: s.TraceID,
+            dropped: t.dropped,
         })
         t.spans = nil
         return
@@ -514,13 +517,16 @@ func (t *trace) finishedOne(s *span) {
     t.finishChunk(tr, &chunk{
         spans: finishedSpans,
         willSend: decisionKeep == samplingDecision(atomic.LoadUint32((*uint32)(&t.samplingDecision))),
+        traceID: s.TraceID,
+        dropped: t.dropped,
     })
     t.spans = leftoverSpans
 }

 func (t *trace) finishChunk(tr *tracer, ch *chunk) {
     atomic.AddUint32(&tr.spansFinished, uint32(len(ch.spans)))
     tr.pushChunk(ch)
+    t.dropped = ch.dropped
     t.finished = 0 // important, because a buffer can be used for several flushes
 }

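Taken together with the pushChunk change later in this diff, the copy-back in finishChunk is what keeps a trace that flushes in several chunks from being counted more than once: the push path sets the chunk's dropped flag the first time the queue is full, and finishChunk writes that flag back onto the trace so later chunks inherit it. The following is a minimal, self-contained sketch of that round trip; the types and helpers are simplified stand-ins, not the tracer's real ones.

package main

import "fmt"

// Simplified stand-ins for the tracer's trace and chunk types.
type chunk struct{ dropped bool }
type trace struct{ dropped bool }

// pushChunk mimics the queue-full path: count the trace only the first time.
func pushChunk(ch *chunk, queueFull bool, total *int) {
    if queueFull && !ch.dropped {
        *total++
        ch.dropped = true
    }
}

// finishChunk mimics trace.finishChunk: seed the chunk with the trace's flag,
// push it, then copy the (possibly updated) flag back onto the trace.
func (t *trace) finishChunk(ch *chunk, queueFull bool, total *int) {
    ch.dropped = t.dropped
    pushChunk(ch, queueFull, total)
    t.dropped = ch.dropped
}

func main() {
    total := 0
    tr := &trace{}
    tr.finishChunk(&chunk{}, true, &total) // queue full: counted once
    tr.finishChunk(&chunk{}, true, &total) // still full: not counted again
    fmt.Println(total) // prints 1
}

Running the sketch prints 1, which mirrors the expectation asserted by the new TestPushTrace/overload subtest further down.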
22 changes: 21 additions & 1 deletion ddtrace/tracer/tracer.go
@@ -81,6 +81,12 @@ type tracer struct {
     // finished, and dropped
     spansStarted, spansFinished, tracesDropped uint32

+    // Keeps track of the total number of traces dropped for accurate logging.
+    totalTracesDropped struct {
+        mu sync.Mutex
+        count uint32
+    }
+
     // Records the number of dropped P0 traces and spans.
     droppedP0Traces, droppedP0Spans uint32
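The new field bundles the counter with its mutex in one anonymous struct, so the count is only ever read or written with the lock held. A small, self-contained sketch of the same pattern, using illustrative names rather than the library's:

package main

import (
    "fmt"
    "sync"
)

// droppedCounter mirrors the shape of totalTracesDropped: the mutex lives
// next to the count it protects.
type droppedCounter struct {
    mu    sync.Mutex
    count uint32
}

func (c *droppedCounter) inc() {
    c.mu.Lock()
    defer c.mu.Unlock()
    c.count++
}

func main() {
    var c droppedCounter
    var wg sync.WaitGroup
    for i := 0; i < 100; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            c.inc() // safe to call from many goroutines
        }()
    }
    wg.Wait()
    fmt.Println(c.count) // always 100
}

Keeping the mutex next to the data it guards makes the locking discipline visible at the declaration site.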

@@ -401,6 +407,12 @@ func (t *tracer) worker(tick <-chan time.Time) {
             }
             return
         }
+
+        t.totalTracesDropped.mu.Lock()
+        if t.totalTracesDropped.count > 0 {
+            log.Error("%d traces dropped through payload queue", t.totalTracesDropped.count)
+        }
+        t.totalTracesDropped.mu.Unlock()
     }
 }

@@ -410,6 +422,8 @@ func (t *tracer) worker(tick <-chan time.Time) {
 type chunk struct {
     spans []*span
     willSend bool // willSend indicates whether the trace will be sent to the agent.
+    traceID uint64
+    dropped bool // indicates if the parent trace has already been dropped
 }

 // sampleChunk applies single-span sampling to the provided trace.
@@ -451,7 +465,13 @@ func (t *tracer) pushChunk(trace *chunk) {
     select {
     case t.out <- trace:
     default:
-        log.Error("payload queue full, dropping %d traces", len(trace.spans))
+        log.Debug("payload queue full, trace %d dropped %d spans", trace.traceID, len(trace.spans))
+        if !trace.dropped {
+            t.totalTracesDropped.mu.Lock()
+            defer t.totalTracesDropped.mu.Unlock()
+            t.totalTracesDropped.count += 1
+            trace.dropped = true
+        }
     }
 }

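For reference, this is roughly how the two new messages render, given the format strings above; the values are invented and the tracer's logger adds its own prefix and level:

package main

import "fmt"

func main() {
    // Per-occurrence message, now logged at debug level from pushChunk.
    fmt.Printf("payload queue full, trace %d dropped %d spans\n", uint64(42), 3)
    // Aggregated message, logged at error level from the worker.
    fmt.Printf("%d traces dropped through payload queue\n", 5)
}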
25 changes: 23 additions & 2 deletions ddtrace/tracer/tracer_test.go
@@ -1591,11 +1591,11 @@ func TestPushPayload(t *testing.T) {
     s.Meta["key"] = strings.Repeat("X", payloadSizeLimit/2+10)

     // half payload size reached
-    tracer.pushChunk(&chunk{[]*span{s}, true})
+    tracer.pushChunk(&chunk{[]*span{s}, true, s.TraceID, false})
     tracer.awaitPayload(t, 1)

     // payload size exceeded
-    tracer.pushChunk(&chunk{[]*span{s}, true})
+    tracer.pushChunk(&chunk{[]*span{s}, true, s.TraceID, false})
     flush(2)
 }

@@ -1632,6 +1632,27 @@ func TestPushTrace(t *testing.T) {
     assert.Len(tracer.out, payloadQueueSize)
     log.Flush()
     assert.True(len(tp.Logs()) >= 1)
+
+    t.Run("overload", func(t *testing.T) {
+        tracer := newUnstartedTracer()
+        defer tracer.statsd.Close()
+
+        root := newSpan("name", "service", "resource", 0, 0, 0)
+        trace := root.context.trace
+
+        many := payloadQueueSize + 2
+        for i := 0; i < many; i++ {
+            trace.mu.Lock()
+            c := chunk{spans: make([]*span, i),
+                traceID: trace.root.TraceID,
+                dropped: trace.dropped,
+            }
+            tracer.pushChunk(&c)
+            trace.dropped = c.dropped
+            trace.mu.Unlock()
+        }
+        assert.Equal(uint32(1), tracer.totalTracesDropped.count)
+    })
 }

 func TestTracerFlush(t *testing.T) {
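To run just the new subtest locally, something along these lines should work from the repository root (assuming a standard Go toolchain and module setup for this repo):

go test ./ddtrace/tracer -run 'TestPushTrace/overload' -v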