feat: XChaCha20 voice encryption (#1242)

brainboxdotcc · Sep 21, 2024 · a797675 · a797675
2 parents b90d9aa + a0f5bc3
commit a797675
Show file tree

Hide file tree

Showing 2 changed files with 193 additions and 128 deletions.
diff --git a/include/dpp/discordvoiceclient.h b/include/dpp/discordvoiceclient.h
@@ -346,6 +346,17 @@ class DPP_EXPORT discord_voice_client : public websocket_client
 	 */
 	uint32_t timestamp;
 
+	/**
+	 * @brief Each packet should have a nonce: a 32-bit incrementing
+	 * integer value appended to the payload.
+	 *
+	 * We should keep track of this value and increment it for each
+	 * packet sent.
+	 *
+	 * Current initial value is hardcoded to 1.
+	 */
+	uint32_t packet_nonce;
+
 	/**
 	 * @brief Last sent packet high-resolution timestamp
 	 */

diff --git a/src/dpp/discordvoiceclient.cpp b/src/dpp/discordvoiceclient.cpp
@@ -20,6 +20,7 @@
  *
  ************************************************************************************/
 
+#include <cstdint>
 #include <dpp/export.h>
 #ifdef _WIN32
 	#include <WinSock2.h>
@@ -316,6 +317,7 @@ discord_voice_client::discord_voice_client(dpp::cluster* _cluster, snowflake _ch
 	secret_key(nullptr),
 	sequence(0),
 	timestamp(0),
+	packet_nonce(1),
 	last_timestamp(std::chrono::high_resolution_clock::now()),
 	sending(false),
 	tracks(0),
@@ -593,6 +595,9 @@ bool discord_voice_client::handle_frame(const std::string &data)
 					rdy.voice_channel_id = this->channel_id;
 					creator->on_voice_ready.call(rdy);
 				}
+
+				/* Reset packet_nonce */
+				packet_nonce = 1;
 			}
 			break;
 			/* Voice ready */
@@ -711,131 +716,157 @@ void discord_voice_client::read_ready()
 	uint8_t buffer[65535];
 	int packet_size = this->udp_recv((char*)buffer, sizeof(buffer));
 
-	if (packet_size > 0 && (!creator->on_voice_receive.empty() || !creator->on_voice_receive_combined.empty())) {
-		constexpr size_t header_size = 12;
-		if (static_cast<size_t>(packet_size) < header_size) {
-			/* Invalid RTP payload */
-			return;
-		}
+	bool receive_handler_is_empty = creator->on_voice_receive.empty() && creator->on_voice_receive_combined.empty();
+	if (packet_size <= 0 || receive_handler_is_empty) {
+		/* Nothing to do */
+		return;
+	}
 
-		/* It's a "silence packet" - throw it away. */
-		if (packet_size < 44) {
-			return;
-		}
+	constexpr size_t header_size = 12;
+	if (static_cast<size_t>(packet_size) < header_size) {
+		/* Invalid RTP payload */
+		return;
+	}
 
-		if (uint8_t payload_type = buffer[1] & 0b0111'1111;
-		    72 <= payload_type && payload_type <= 76) {
-			/*
-			 * This is an RTCP payload. Discord is known to send
-			 * RTCP Receiver Reports.
-			 *
-			 * See https://datatracker.ietf.org/doc/html/rfc3551#section-6
-			 */
-			return;
-		}
+	/* It's a "silence packet" - throw it away. */
+	if (packet_size < 44) {
+		return;
+	}
 
-		voice_payload vp{0, // seq, populate later
-		                 0, // timestamp, populate later
-		                 std::make_unique<voice_receive_t>(nullptr, std::string((char*)buffer, packet_size))};
+	if (uint8_t payload_type = buffer[1] & 0b0111'1111;
+		72 <= payload_type && payload_type <= 76) {
+		/*
+		 * This is an RTCP payload. Discord is known to send
+		 * RTCP Receiver Reports.
+		 *
+		 * See https://datatracker.ietf.org/doc/html/rfc3551#section-6
+		 */
+		return;
+	}
 
-		vp.vr->voice_client = this;
+	voice_payload vp{0, // seq, populate later
+		0, // timestamp, populate later
+		std::make_unique<voice_receive_t>(nullptr, std::string((char*)buffer, packet_size))};
 
-		{	/* Get the User ID of the speaker */
-			uint32_t speaker_ssrc;
-			std::memcpy(&speaker_ssrc, &buffer[8], sizeof(uint32_t));
-			speaker_ssrc = ntohl(speaker_ssrc);
-			vp.vr->user_id = ssrc_map[speaker_ssrc];
-		}
+	vp.vr->voice_client = this;
 
-		/* Get the sequence number of the voice UDP packet */
-		std::memcpy(&vp.seq, &buffer[2], sizeof(rtp_seq_t));
-		vp.seq = ntohs(vp.seq);
-		/* Get the timestamp of the voice UDP packet */
-		std::memcpy(&vp.timestamp, &buffer[4], sizeof(rtp_timestamp_t));
-		vp.timestamp = ntohl(vp.timestamp);
-
-		/* Nonce is the RTP Header with zero padding */
-		uint8_t nonce[24] = { 0 };
-		std::memcpy(nonce, &buffer[0], header_size);
-
-		/* Get the number of CSRC in header */
-		const size_t csrc_count = buffer[0] & 0b0000'1111;
-		/* Skip to the encrypted voice data */
-		const ptrdiff_t offset_to_data = header_size + sizeof(uint32_t) * csrc_count;
-		uint8_t* encrypted_data = buffer + offset_to_data;
-		const size_t encrypted_data_len = packet_size - offset_to_data;
-
-		if(crypto_aead_xchacha20poly1305_ietf_decrypt() != 0)
-
-		if (crypto_secretbox_open_easy(encrypted_data, encrypted_data,
-		                               encrypted_data_len, nonce, secret_key)) {
-			/* Invalid Discord RTP payload. */
-			return;
-		}
+	uint32_t speaker_ssrc;
+	{	/* Get the User ID of the speaker */
+		std::memcpy(&speaker_ssrc, &buffer[8], sizeof(uint32_t));
+		speaker_ssrc = ntohl(speaker_ssrc);
+		vp.vr->user_id = ssrc_map[speaker_ssrc];
+	}
 
-                const uint8_t* decrypted_data = encrypted_data;
-                size_t decrypted_data_len = encrypted_data_len - crypto_box_MACBYTES;
-		if ([[maybe_unused]] const bool uses_extension = (buffer[0] >> 4) & 0b0001) {
-			/* Skip the RTP Extensions */
-			size_t ext_len = 0;
-			{
-				uint16_t ext_len_in_words;
-				memcpy(&ext_len_in_words, &decrypted_data[2], sizeof(uint16_t));
-				ext_len_in_words = ntohs(ext_len_in_words);
-				ext_len = sizeof(uint32_t) * ext_len_in_words;
-			}
-			constexpr size_t ext_header_len = sizeof(uint16_t) * 2;
-                        decrypted_data += ext_header_len + ext_len;
-                        decrypted_data_len -= ext_header_len + ext_len;
+	/* Get the sequence number of the voice UDP packet */
+	std::memcpy(&vp.seq, &buffer[2], sizeof(rtp_seq_t));
+	vp.seq = ntohs(vp.seq);
+	/* Get the timestamp of the voice UDP packet */
+	std::memcpy(&vp.timestamp, &buffer[4], sizeof(rtp_timestamp_t));
+	vp.timestamp = ntohl(vp.timestamp);
+
+	constexpr size_t nonce_size = sizeof(uint32_t);
+	/* Nonce is the last 4 bytes of the packet, zero-padded to 24 bytes */
+	uint8_t nonce[24] = { 0 };
+	std::memcpy(nonce, buffer + packet_size - nonce_size, nonce_size);
+
+	/* Get the number of CSRC in header */
+	const size_t csrc_count = buffer[0] & 0b0000'1111;
+	/* Skip to the encrypted voice data */
+	const ptrdiff_t offset_to_data = header_size + sizeof(uint32_t) * csrc_count;
+	size_t total_header_len = offset_to_data;
+
+	uint8_t* ciphertext = buffer + offset_to_data;
+	size_t ciphertext_len = packet_size - offset_to_data - nonce_size;
+
+	size_t ext_len = 0;
+	if ([[maybe_unused]] const bool uses_extension = (buffer[0] >> 4) & 0b0001) {
+		/**
+		 * Get the RTP extension size. Only the size is read here because
+		 * the extension itself is encrypted along with the opus packet.
+		 */
+		{
+			uint16_t ext_len_in_words;
+			memcpy(&ext_len_in_words, &ciphertext[2], sizeof(uint16_t));
+			ext_len_in_words = ntohs(ext_len_in_words);
+			ext_len = sizeof(uint32_t) * ext_len_in_words;
 		}
+		constexpr size_t ext_header_len = sizeof(uint16_t) * 2;
+		ciphertext += ext_header_len;
+		ciphertext_len -= ext_header_len;
+		total_header_len += ext_header_len;
+	}
 
-		/*
-		 * We're left with the decrypted, opus-encoded data.
-		 * Park the payload and decode on the voice courier thread.
+	uint8_t decrypted[65535] = { 0 };
+	unsigned long long opus_packet_len  = 0;
+	if (crypto_aead_xchacha20poly1305_ietf_decrypt(
+		decrypted, &opus_packet_len,
+		nullptr,
+		ciphertext, ciphertext_len,
+		buffer,
+		/**
+		 * Additional Data:
+		 * The whole header (including csrc list) +
+		 * 4 byte extension header (magic 0xBEDE + 16-bit denoting extension length)
 		 */
-		vp.vr->audio_data.assign(decrypted_data, decrypted_data + decrypted_data_len);
+		total_header_len,
+		nonce, secret_key) != 0) {
+		/* Invalid Discord RTP payload. */
+		return;
+	}
 
-		{
-			std::lock_guard lk(voice_courier_shared_state.mtx);
-			auto& [range, payload_queue, pending_decoder_ctls, decoder] = voice_courier_shared_state.parked_voice_payloads[vp.vr->user_id];
+	uint8_t *opus_packet = decrypted;
+	if (ext_len > 0) {
+		/* Skip previously encrypted RTP Header Extension */
+		opus_packet += ext_len;
+		opus_packet_len -= ext_len;
+	}
 
-			if (!decoder) {
-				/*
-				 * Most likely this is the first time we encounter this speaker.
-				 * Do some initialization for not only the decoder but also the range.
+	/*
+	 * We're left with the decrypted, opus-encoded data.
+	 * Park the payload and decode on the voice courier thread.
+	 */
+	vp.vr->audio_data.assign(opus_packet, opus_packet + opus_packet_len);
+
+	{
+		std::lock_guard lk(voice_courier_shared_state.mtx);
+		auto& [range, payload_queue, pending_decoder_ctls, decoder] = voice_courier_shared_state.parked_voice_payloads[vp.vr->user_id];
+
+		if (!decoder) {
+			/*
+			 * Most likely this is the first time we encounter this speaker.
+			 * Do some initialization for not only the decoder but also the range.
+			 */
+			range.min_seq = vp.seq;
+			range.min_timestamp = vp.timestamp;
+
+			int opus_error = 0;
+			decoder.reset(opus_decoder_create(opus_sample_rate_hz, opus_channel_count, &opus_error),
+				 &opus_decoder_destroy);
+			if (opus_error) {
+				/**
+				 * NOTE: The -10 here makes the opus_error match up with values of exception_error_code,
+				 * which would otherwise conflict as every C library loves to use values from -1 downwards.
 				 */
-				range.min_seq = vp.seq;
-				range.min_timestamp = vp.timestamp;
-
-				int opus_error = 0;
-				decoder.reset(opus_decoder_create(opus_sample_rate_hz, opus_channel_count, &opus_error),
-				              &opus_decoder_destroy);
-				if (opus_error) {
-					/**
-					 * NOTE: The -10 here makes the opus_error match up with values of exception_error_code,
-					 * which would otherwise conflict as every C library loves to use values from -1 downwards.
-					 */
-					throw dpp::voice_exception((exception_error_code)(opus_error - 10), "discord_voice_client::discord_voice_client; opus_decoder_create() failed");
-				}
+				throw dpp::voice_exception((exception_error_code)(opus_error - 10), "discord_voice_client::discord_voice_client; opus_decoder_create() failed");
 			}
+		}
 
-			if (vp.seq < range.min_seq && vp.timestamp < range.min_timestamp) {
-				/* This packet arrived too late. We can only discard it. */
-				return;
-			}
-			range.max_seq = vp.seq;
-			range.max_timestamp = vp.timestamp;
-			payload_queue.push(std::move(vp));
+		if (vp.seq < range.min_seq && vp.timestamp < range.min_timestamp) {
+			/* This packet arrived too late. We can only discard it. */
+			return;
 		}
+		range.max_seq = vp.seq;
+		range.max_timestamp = vp.timestamp;
+		payload_queue.push(std::move(vp));
+	}
 
-		voice_courier_shared_state.signal_iteration.notify_one();
+	voice_courier_shared_state.signal_iteration.notify_one();
 
-		if (!voice_courier.joinable()) {
-			/* Courier thread is not running, start it */
-			voice_courier = std::thread(&voice_courier_loop,
-			                            std::ref(*this),
-			                            std::ref(voice_courier_shared_state));
-		}
+	if (!voice_courier.joinable()) {
+		/* Courier thread is not running, start it */
+		voice_courier = std::thread(&voice_courier_loop,
+							  std::ref(*this),
+							  std::ref(voice_courier_shared_state));
 	}
 #else
 	throw dpp::voice_exception(err_no_voice_support, "Voice support not enabled in this build of D++");
@@ -1244,13 +1275,13 @@ discord_voice_client& discord_voice_client::send_audio_raw(uint16_t* audio_data,
 		return send_audio_raw((uint16_t*)packet.data(), packet.size());
 	}
 
-	opus_int32 encodedAudioMaxLength = (opus_int32)length;
-	std::vector<uint8_t> encodedAudioData(encodedAudioMaxLength);
-	size_t encodedAudioLength = encodedAudioMaxLength;
+	opus_int32 encoded_audio_max_length = (opus_int32)length;
+	std::vector<uint8_t> encoded_audio(encoded_audio_max_length);
+	size_t encoded_audio_length = encoded_audio_max_length;
 
-	encodedAudioLength = this->encode((uint8_t*)audio_data, length, encodedAudioData.data(), encodedAudioLength);
+	encoded_audio_length = this->encode((uint8_t*)audio_data, length, encoded_audio.data(), encoded_audio_length);
 
-	send_audio_opus(encodedAudioData.data(), encodedAudioLength);
+	send_audio_opus(encoded_audio.data(), encoded_audio_length);
 #else
 	throw dpp::voice_exception(err_no_voice_support, "Voice support not enabled in this build of D++");
 #endif
@@ -1270,31 +1301,54 @@ discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet
 
 discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet, const size_t length, uint64_t duration) {
 #if HAVE_VOICE
-	int frameSize = (int)(48 * duration * (timescale / 1000000));
-	opus_int32 encodedAudioMaxLength = (opus_int32)length;
-	std::vector<uint8_t> encodedAudioData(encodedAudioMaxLength);
-	size_t encodedAudioLength = encodedAudioMaxLength;
+	int frame_size = (int)(48 * duration * (timescale / 1000000));
+	opus_int32 encoded_audio_max_length = (opus_int32)length;
+	std::vector<uint8_t> encoded_audio(encoded_audio_max_length);
+	size_t encoded_audio_length = encoded_audio_max_length;
 
-	encodedAudioLength = length;
-	encodedAudioData.reserve(length);
-	memcpy(encodedAudioData.data(), opus_packet, length);
+	encoded_audio_length = length;
+	encoded_audio.reserve(length);
+	memcpy(encoded_audio.data(), opus_packet, length);
 
 	++sequence;
 	rtp_header header(sequence, timestamp, (uint32_t)ssrc);
 
-	std::vector<uint8_t> audioDataPacket(sizeof(header) + encodedAudioLength + crypto_secretbox_MACBYTES);
-	std::memcpy(audioDataPacket.data(), &header, sizeof(header));
+	/* Expected payload size is unencrypted header + encrypted opus packet + unencrypted 32 bit nonce */
+	size_t packet_siz = sizeof(header) + (encoded_audio_length + crypto_aead_xchacha20poly1305_IETF_ABYTES) + sizeof(packet_nonce);
+
+	std::vector<uint8_t> payload(packet_siz);
+
+	/* Set RTP header */
+	std::memcpy(payload.data(), &header, sizeof(header));
+
+	/* Convert nonce to big-endian */
+	uint32_t noncel = htonl(packet_nonce);
+
+	/* Encryption needs a 24-byte nonce, but Discord only wants 4 bytes, so the rest is filled with null bytes */
+	unsigned char encrypt_nonce[crypto_aead_xchacha20poly1305_ietf_NPUBBYTES] = { '\0' };
+	memcpy(encrypt_nonce, &noncel, sizeof(noncel));
 
-	unsigned char nonce[crypto_aead_xchacha20poly1305_ietf_NPUBBYTES];
-	randombytes_buf(nonce, sizeof nonce);
+	/* Encrypt the opus packet, passing the RTP header as additional data */
+	crypto_aead_xchacha20poly1305_ietf_encrypt(
+			payload.data() + sizeof(header),
+			nullptr,
+			encoded_audio.data(),
+			encoded_audio_length,
+			/* The RTP Header as Additional Data */
+			reinterpret_cast<const unsigned char *>(&header),
+			sizeof(header),
+			nullptr,
+			static_cast<const unsigned char*>(encrypt_nonce),
+			secret_key);
 
-	unsigned long long clen_p;
-	crypto_aead_xchacha20poly1305_ietf_encrypt(audioDataPacket.data() + sizeof(header), &clen_p, encodedAudioData.data(), encodedAudioLength, NULL, NULL, NULL, (const unsigned char*)nonce, secret_key);
+	/* Append the 4 byte nonce to the resulting payload */
+	std::memcpy(payload.data() + payload.size() - sizeof(noncel), &noncel, sizeof(noncel));
 
-	//crypto_secretbox_easy(audioDataPacket.data() + sizeof(header), encodedAudioData.data(), encodedAudioLength, (const unsigned char*)nonce, secret_key);
+	this->send(reinterpret_cast<const char*>(payload.data()), payload.size(), duration);
+	timestamp += frame_size;
 
-	this->send((const char*)audioDataPacket.data(), audioDataPacket.size(), duration);
-	timestamp += frameSize;
+	/* Increment for next packet */
+	packet_nonce++;
 
 	speak();
 #else