Detect out of order messages

mkuratczyk · mkuratczyk · commit 84717f2ea234 · 2024-08-06T11:28:25.000+02:00
* compare each message's timestamp with the previous message's timestamp
* log when a message was sent before the previously received one
* new metric for out-of-order messages
* print the priority when a message is received
diff --git a/cmd/cmd_test.go b/cmd/cmd_test.go
@@ -124,7 +124,7 @@ func TestLatencyCalculationNano(t *testing.T) {
 	testMsg := utils.MessageBody(100)
 	utils.UpdatePayload(false, &testMsg)
 	time.Sleep(1 * time.Microsecond)
-	latency := utils.CalculateEndToEndLatency(false, &testMsg)
+	_, latency := utils.CalculateEndToEndLatency(false, &testMsg)
 	// not very precise but we just care about the order of magnitude
 	assert.Greater(t, latency, 0.000001)
 	assert.Less(t, latency, 0.001)
@@ -134,7 +134,7 @@ func TestLatencyCalculationMillis(t *testing.T) {
 	testMsg := utils.MessageBody(100)
 	utils.UpdatePayload(true, &testMsg)
 	time.Sleep(2 * time.Millisecond)
-	latency := utils.CalculateEndToEndLatency(true, &testMsg)
+	_, latency := utils.CalculateEndToEndLatency(true, &testMsg)
 	// not very precise but we just care about the order of magnitude
 	assert.Greater(t, latency, 0.001)
 	assert.Less(t, latency, 0.010)
@@ -170,7 +170,7 @@ func BenchmarkLatencyCalculation(b *testing.B) {
 	utils.UpdatePayload(false, &testMsg)
 
 	for i := 0; i < b.N; i++ {
-		_ = utils.CalculateEndToEndLatency(false, &testMsg)
+		_, _ = utils.CalculateEndToEndLatency(false, &testMsg)
 	}
 }
 
@@ -192,7 +192,8 @@ func BenchmarkObservingLatency(b *testing.B) {
 
 	for i := 0; i < b.N; i++ {
 		utils.UpdatePayload(false, &testMsg)
-		metric.With(prometheus.Labels{"protocol": "foo"}).Observe(utils.CalculateEndToEndLatency(false, &testMsg))
+		_, latency := utils.CalculateEndToEndLatency(false, &testMsg)
+		metric.With(prometheus.Labels{"protocol": "foo"}).Observe(latency)
 	}
 }
 
@@ -211,6 +212,7 @@ func BenchmarkObservingLatencyMillis(b *testing.B) {
 
 	for i := 0; i < b.N; i++ {
 		utils.UpdatePayload(true, &testMsg)
-		metric.With(prometheus.Labels{"protocol": "foo"}).Observe(utils.CalculateEndToEndLatency(false, &testMsg))
+		_, latency := utils.CalculateEndToEndLatency(false, &testMsg)
+		metric.With(prometheus.Labels{"protocol": "foo"}).Observe(latency)
 	}
 }
diff --git a/pkg/amqp10_client/consumer.go b/pkg/amqp10_client/consumer.go
@@ -80,6 +80,7 @@ func (c Amqp10Consumer) Start(ctx context.Context, subscribed chan bool) {
 	m := metrics.EndToEndLatency
 
 	log.Info("consumer started", "protocol", "amqp-1.0", "consumerId", c.Id, "terminus", c.Topic)
+	previousMessageTimeSent := time.Unix(0, 0)
 
 	for i := 1; i <= c.Config.ConsumeCount; i++ {
 		select {
@@ -95,12 +96,22 @@ func (c Amqp10Consumer) Start(ctx context.Context, subscribed chan bool) {
 
 			payload := msg.GetData()
 			priority := strconv.Itoa(int(msg.Header.Priority))
-			m.With(prometheus.Labels{"protocol": "amqp-1.0"}).Observe(utils.CalculateEndToEndLatency(c.Config.UseMillis, &payload))
+			timeSent, latency := utils.CalculateEndToEndLatency(c.Config.UseMillis, &payload)
+			m.With(prometheus.Labels{"protocol": "amqp-1.0"}).Observe(latency)
 
-			log.Debug("message received", "protocol", "amqp-1.0", "consumerId", c.Id, "terminus", c.Topic, "size", len(payload))
+			if timeSent.Before(previousMessageTimeSent) {
+				metrics.MessagesConsumedOutOfOrder.With(prometheus.Labels{"protocol": "amqp-1.0", "priority": priority}).Inc()
+				log.Info("Out of order message received. This message was sent before the previous message", "this messsage",timeSent, "previous message", previousMessageTimeSent)
+			}
+			previousMessageTimeSent = timeSent
+
+			log.Debug("message received", "protocol", "amqp-1.0", "consumerId", c.Id, "terminus", c.Topic, "size", len(payload), "priority", priority, "sent", timeSent)
+
+			if c.Config.ConsumerLatency > 0 {
+				log.Debug("consumer latency", "protocol", "amqp-1.0", "consumerId", c.Id, "latency", c.Config.ConsumerLatency)
+				time.Sleep(c.Config.ConsumerLatency)
+			}
 
-			log.Debug("consumer latency", "protocol", "amqp-1.0", "consumerId", c.Id, "latency", c.Config.ConsumerLatency)
-			time.Sleep(c.Config.ConsumerLatency)
 			err = receiver.AcceptMessage(ctx, msg)
 			if err != nil {
 				log.Error("message NOT accepted", "protocol", "amqp-1.0", "consumerId", c.Id, "terminus", c.Topic)
diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go
@@ -30,6 +30,7 @@ var metricsServer *MetricsServer
 var (
 	MessagesPublished *prometheus.CounterVec
 	MessagesConsumed  *prometheus.CounterVec
+	MessagesConsumedOutOfOrder  *prometheus.CounterVec
 	PublishingLatency *prometheus.SummaryVec
 	EndToEndLatency   *prometheus.HistogramVec
 )
@@ -49,6 +50,13 @@ func RegisterMetrics(globalLabels prometheus.Labels) {
 			ConstLabels: globalLabels,
 		}, []string{"protocol", "priority"})
 	}
+	if MessagesConsumedOutOfOrder == nil {
+		MessagesConsumedOutOfOrder = promauto.NewCounterVec(prometheus.CounterOpts{
+			Name:        "omq_messages_consumed_out_of_order",
+			Help:        "The number of messages consumed out of order",
+			ConstLabels: globalLabels,
+		}, []string{"protocol", "priority"})
+	}
 	if PublishingLatency == nil {
 		PublishingLatency = promauto.NewSummaryVec(prometheus.SummaryOpts{
 			Name:        "omq_publishing_latency_seconds",
diff --git a/pkg/mqtt_client/consumer.go b/pkg/mqtt_client/consumer.go
@@ -59,10 +59,20 @@ func (c MqttConsumer) Start(ctx context.Context, subscribed chan bool) {
 
 	msgsReceived := 0
 
+	previousMessageTimeSent := time.Unix(0, 0)
+
 	handler := func(client mqtt.Client, msg mqtt.Message) {
 		metrics.MessagesConsumed.With(prometheus.Labels{"protocol": "mqtt", "priority": ""}).Inc()
 		payload := msg.Payload()
-		m.Observe(utils.CalculateEndToEndLatency(c.Config.UseMillis, &payload))
+		timeSent, latency := utils.CalculateEndToEndLatency(c.Config.UseMillis, &payload)
+		m.Observe(latency)
+
+		if timeSent.Before(previousMessageTimeSent) {
+			metrics.MessagesConsumedOutOfOrder.With(prometheus.Labels{"protocol": "mqtt"}).Inc()
+			log.Info("Out of order message received. This message was sent before the previous message", "this messsage", timeSent, "previous message", previousMessageTimeSent)
+		}
+		previousMessageTimeSent = timeSent
+
 		msgsReceived++
 		log.Debug("message received", "protocol", "mqtt", "consumerId", c.Id, "topic", c.Topic, "size", len(payload))
 	}
diff --git a/pkg/stomp_client/consumer.go b/pkg/stomp_client/consumer.go
@@ -62,25 +62,42 @@ func (c StompConsumer) Start(ctx context.Context, subscribed chan bool) {
 	close(subscribed)
 
 	m := metrics.EndToEndLatency.With(prometheus.Labels{"protocol": "stomp"})
+
 	log.Info("consumer started", "protocol", "STOMP", "consumerId", c.Id, "destination", c.Topic)
+	previousMessageTimeSent := time.Unix(0, 0)
+
 	for i := 1; i <= c.Config.ConsumeCount; i++ {
 		select {
 		case msg := <-sub.C:
 			if msg.Err != nil {
 				log.Error("failed to receive a message", "protocol", "STOMP", "consumerId", c.Id, "c.Topic", c.Topic, "error", msg.Err)
 				return
 			}
-			m.Observe(utils.CalculateEndToEndLatency(c.Config.UseMillis, &msg.Body))
-			log.Debug("message received", "protocol", "stomp", "consumerId", c.Id, "destination", c.Topic, "size", len(msg.Body), "ack required", msg.ShouldAck())
 
-			log.Debug("consumer latency", "protocol", "stomp", "consumerId", c.Id, "latency", c.Config.ConsumerLatency)
-			time.Sleep(c.Config.ConsumerLatency)
+			timeSent, latency := utils.CalculateEndToEndLatency(c.Config.UseMillis, &msg.Body)
+			m.Observe(latency)
+
+			priority := msg.Header.Get("priority")
+
+			if timeSent.Before(previousMessageTimeSent) {
+				metrics.MessagesConsumedOutOfOrder.With(prometheus.Labels{"protocol": "amqp-1.0", "priority": priority}).Inc()
+				log.Info("Out of order message received. This message was sent before the previous message", "this messsage", timeSent, "previous message", previousMessageTimeSent)
+			}
+			previousMessageTimeSent = timeSent
+
+			log.Debug("message received", "protocol", "stomp", "consumerId", c.Id, "destination", c.Topic, "size", len(msg.Body), "ack required", msg.ShouldAck(), "priority", priority, "sent", timeSent)
+
+			if c.Config.ConsumerLatency > 0 {
+				log.Debug("consumer latency", "protocol", "stomp", "consumerId", c.Id, "latency", c.Config.ConsumerLatency)
+				time.Sleep(c.Config.ConsumerLatency)
+			}
+
 			err = c.Connection.Ack(msg)
 			if err != nil {
 				log.Error("message NOT acknowledged", "protocol", "stomp", "consumerId", c.Id, "destination", c.Topic)
 
 			}
-			metrics.MessagesConsumed.With(prometheus.Labels{"protocol": "stomp", "priority": msg.Header.Get("priority")}).Inc()
+			metrics.MessagesConsumed.With(prometheus.Labels{"protocol": "stomp", "priority": priority}).Inc()
 		case <-ctx.Done():
 			c.Stop("time limit reached")
 			return
diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go
@@ -20,22 +20,33 @@ func UpdatePayload(useMillis bool, payload *[]byte) *[]byte {
 	return payload
 }
 
-func CalculateEndToEndLatency(useMillis bool, payload *[]byte) float64 {
+func CalculateEndToEndLatency(useMillis bool, payload *[]byte) (time.Time, float64) {
 	if len(*payload) < 12 {
 		// message sent without latency tracking
-		return 0
+		return time.Unix(0, 0), 0
 	}
 	timeSent := binary.BigEndian.Uint64((*payload)[4:])
 
 	if useMillis {
 		// less precise but necessary when a different process publishes and consumes
 		now := uint64(time.Now().UnixMilli())
 		latency := now - timeSent
-		return (float64(latency) / 1000)
+		return FormatTimestamp(timeSent), (float64(latency) / 1000)
 	} else {
 		// nanoseconds - more precise when the same process publishes and consumes
 		now := uint64(time.Now().UnixNano())
 		latency := now - timeSent
-		return (float64(latency) / 1000000000)
+		return FormatTimestamp(timeSent), (float64(latency) / 1000000000)
 	}
 }
+
+func FormatTimestamp(timestamp uint64) time.Time {
+	var t time.Time
+	// heuristic: values below 4102441200000 (roughly the year 2100 as Unix milliseconds) are read as milliseconds, anything larger as nanoseconds; should be updated before the year 2100 ;)
+	if timestamp < 4102441200000 {
+		t = time.UnixMilli(int64(timestamp))
+	} else {
+		t = time.Unix(0, int64(timestamp))
+	}
+	return t
+}