Skip to content

Commit

Permalink
feat: XChaCha20 voice encryption (#1242)
Browse files Browse the repository at this point in the history
  • Loading branch information
braindigitalis authored Sep 21, 2024
2 parents b90d9aa + a0f5bc3 commit a797675
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 128 deletions.
11 changes: 11 additions & 0 deletions include/dpp/discordvoiceclient.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,17 @@ class DPP_EXPORT discord_voice_client : public websocket_client
*/
uint32_t timestamp;

/**
* @brief Each packet carries a nonce: a 32-bit incrementing
* integer value appended to the payload.
*
* We should keep track of this value and increment it for each
* packet sent.
*
* Current initial value is hardcoded to 1.
*/
uint32_t packet_nonce;

/**
* @brief Last sent packet high-resolution timestamp
*/
Expand Down
310 changes: 182 additions & 128 deletions src/dpp/discordvoiceclient.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*
************************************************************************************/

#include <cstdint>
#include <dpp/export.h>
#ifdef _WIN32
#include <WinSock2.h>
Expand Down Expand Up @@ -316,6 +317,7 @@ discord_voice_client::discord_voice_client(dpp::cluster* _cluster, snowflake _ch
secret_key(nullptr),
sequence(0),
timestamp(0),
packet_nonce(1),
last_timestamp(std::chrono::high_resolution_clock::now()),
sending(false),
tracks(0),
Expand Down Expand Up @@ -593,6 +595,9 @@ bool discord_voice_client::handle_frame(const std::string &data)
rdy.voice_channel_id = this->channel_id;
creator->on_voice_ready.call(rdy);
}

/* Reset packet_nonce */
packet_nonce = 1;
}
break;
/* Voice ready */
Expand Down Expand Up @@ -711,131 +716,157 @@ void discord_voice_client::read_ready()
uint8_t buffer[65535];
int packet_size = this->udp_recv((char*)buffer, sizeof(buffer));

if (packet_size > 0 && (!creator->on_voice_receive.empty() || !creator->on_voice_receive_combined.empty())) {
constexpr size_t header_size = 12;
if (static_cast<size_t>(packet_size) < header_size) {
/* Invalid RTP payload */
return;
}
bool receive_handler_is_empty = creator->on_voice_receive.empty() && creator->on_voice_receive_combined.empty();
if (packet_size <= 0 || receive_handler_is_empty) {
/* Nothing to do */
return;
}

/* It's a "silence packet" - throw it away. */
if (packet_size < 44) {
return;
}
constexpr size_t header_size = 12;
if (static_cast<size_t>(packet_size) < header_size) {
/* Invalid RTP payload */
return;
}

if (uint8_t payload_type = buffer[1] & 0b0111'1111;
72 <= payload_type && payload_type <= 76) {
/*
* This is an RTCP payload. Discord is known to send
* RTCP Receiver Reports.
*
* See https://datatracker.ietf.org/doc/html/rfc3551#section-6
*/
return;
}
/* It's a "silence packet" - throw it away. */
if (packet_size < 44) {
return;
}

voice_payload vp{0, // seq, populate later
0, // timestamp, populate later
std::make_unique<voice_receive_t>(nullptr, std::string((char*)buffer, packet_size))};
if (uint8_t payload_type = buffer[1] & 0b0111'1111;
72 <= payload_type && payload_type <= 76) {
/*
* This is an RTCP payload. Discord is known to send
* RTCP Receiver Reports.
*
* See https://datatracker.ietf.org/doc/html/rfc3551#section-6
*/
return;
}

vp.vr->voice_client = this;
voice_payload vp{0, // seq, populate later
0, // timestamp, populate later
std::make_unique<voice_receive_t>(nullptr, std::string((char*)buffer, packet_size))};

{ /* Get the User ID of the speaker */
uint32_t speaker_ssrc;
std::memcpy(&speaker_ssrc, &buffer[8], sizeof(uint32_t));
speaker_ssrc = ntohl(speaker_ssrc);
vp.vr->user_id = ssrc_map[speaker_ssrc];
}
vp.vr->voice_client = this;

/* Get the sequence number of the voice UDP packet */
std::memcpy(&vp.seq, &buffer[2], sizeof(rtp_seq_t));
vp.seq = ntohs(vp.seq);
/* Get the timestamp of the voice UDP packet */
std::memcpy(&vp.timestamp, &buffer[4], sizeof(rtp_timestamp_t));
vp.timestamp = ntohl(vp.timestamp);

/* Nonce is the RTP Header with zero padding */
uint8_t nonce[24] = { 0 };
std::memcpy(nonce, &buffer[0], header_size);

/* Get the number of CSRC in header */
const size_t csrc_count = buffer[0] & 0b0000'1111;
/* Skip to the encrypted voice data */
const ptrdiff_t offset_to_data = header_size + sizeof(uint32_t) * csrc_count;
uint8_t* encrypted_data = buffer + offset_to_data;
const size_t encrypted_data_len = packet_size - offset_to_data;

if(crypto_aead_xchacha20poly1305_ietf_decrypt() != 0)

if (crypto_secretbox_open_easy(encrypted_data, encrypted_data,
encrypted_data_len, nonce, secret_key)) {
/* Invalid Discord RTP payload. */
return;
}
uint32_t speaker_ssrc;
{ /* Get the User ID of the speaker */
std::memcpy(&speaker_ssrc, &buffer[8], sizeof(uint32_t));
speaker_ssrc = ntohl(speaker_ssrc);
vp.vr->user_id = ssrc_map[speaker_ssrc];
}

const uint8_t* decrypted_data = encrypted_data;
size_t decrypted_data_len = encrypted_data_len - crypto_box_MACBYTES;
if ([[maybe_unused]] const bool uses_extension = (buffer[0] >> 4) & 0b0001) {
/* Skip the RTP Extensions */
size_t ext_len = 0;
{
uint16_t ext_len_in_words;
memcpy(&ext_len_in_words, &decrypted_data[2], sizeof(uint16_t));
ext_len_in_words = ntohs(ext_len_in_words);
ext_len = sizeof(uint32_t) * ext_len_in_words;
}
constexpr size_t ext_header_len = sizeof(uint16_t) * 2;
decrypted_data += ext_header_len + ext_len;
decrypted_data_len -= ext_header_len + ext_len;
/* Get the sequence number of the voice UDP packet */
std::memcpy(&vp.seq, &buffer[2], sizeof(rtp_seq_t));
vp.seq = ntohs(vp.seq);
/* Get the timestamp of the voice UDP packet */
std::memcpy(&vp.timestamp, &buffer[4], sizeof(rtp_timestamp_t));
vp.timestamp = ntohl(vp.timestamp);

constexpr size_t nonce_size = sizeof(uint32_t);
/* Nonce is the last 4 bytes of the payload, zero-padded to 24 bytes */
uint8_t nonce[24] = { 0 };
std::memcpy(nonce, buffer + packet_size - nonce_size, nonce_size);

/* Get the number of CSRC in header */
const size_t csrc_count = buffer[0] & 0b0000'1111;
/* Skip to the encrypted voice data */
const ptrdiff_t offset_to_data = header_size + sizeof(uint32_t) * csrc_count;
size_t total_header_len = offset_to_data;

uint8_t* ciphertext = buffer + offset_to_data;
size_t ciphertext_len = packet_size - offset_to_data - nonce_size;

size_t ext_len = 0;
if ([[maybe_unused]] const bool uses_extension = (buffer[0] >> 4) & 0b0001) {
/**
* Get the size of the RTP header extension. Only the size is read here
* because the extension body is encrypted along with the opus packet.
*/
{
uint16_t ext_len_in_words;
memcpy(&ext_len_in_words, &ciphertext[2], sizeof(uint16_t));
ext_len_in_words = ntohs(ext_len_in_words);
ext_len = sizeof(uint32_t) * ext_len_in_words;
}
constexpr size_t ext_header_len = sizeof(uint16_t) * 2;
ciphertext += ext_header_len;
ciphertext_len -= ext_header_len;
total_header_len += ext_header_len;
}

/*
* We're left with the decrypted, opus-encoded data.
* Park the payload and decode on the voice courier thread.
uint8_t decrypted[65535] = { 0 };
unsigned long long opus_packet_len = 0;
if (crypto_aead_xchacha20poly1305_ietf_decrypt(
decrypted, &opus_packet_len,
nullptr,
ciphertext, ciphertext_len,
buffer,
/**
* Additional Data:
* The whole header (including csrc list) +
* the 4-byte extension header (magic 0xBEDE + 16-bit extension length), when present
*/
vp.vr->audio_data.assign(decrypted_data, decrypted_data + decrypted_data_len);
total_header_len,
nonce, secret_key) != 0) {
/* Invalid Discord RTP payload. */
return;
}

{
std::lock_guard lk(voice_courier_shared_state.mtx);
auto& [range, payload_queue, pending_decoder_ctls, decoder] = voice_courier_shared_state.parked_voice_payloads[vp.vr->user_id];
uint8_t *opus_packet = decrypted;
if (ext_len > 0) {
/* Skip previously encrypted RTP Header Extension */
opus_packet += ext_len;
opus_packet_len -= ext_len;
}

if (!decoder) {
/*
* Most likely this is the first time we encounter this speaker.
* Do some initialization for not only the decoder but also the range.
/*
* We're left with the decrypted, opus-encoded data.
* Park the payload and decode on the voice courier thread.
*/
vp.vr->audio_data.assign(opus_packet, opus_packet + opus_packet_len);

{
std::lock_guard lk(voice_courier_shared_state.mtx);
auto& [range, payload_queue, pending_decoder_ctls, decoder] = voice_courier_shared_state.parked_voice_payloads[vp.vr->user_id];

if (!decoder) {
/*
* Most likely this is the first time we encounter this speaker.
* Do some initialization for not only the decoder but also the range.
*/
range.min_seq = vp.seq;
range.min_timestamp = vp.timestamp;

int opus_error = 0;
decoder.reset(opus_decoder_create(opus_sample_rate_hz, opus_channel_count, &opus_error),
&opus_decoder_destroy);
if (opus_error) {
/**
* NOTE: The -10 here makes the opus_error match up with values of exception_error_code,
* which would otherwise conflict as every C library loves to use values from -1 downwards.
*/
range.min_seq = vp.seq;
range.min_timestamp = vp.timestamp;

int opus_error = 0;
decoder.reset(opus_decoder_create(opus_sample_rate_hz, opus_channel_count, &opus_error),
&opus_decoder_destroy);
if (opus_error) {
/**
* NOTE: The -10 here makes the opus_error match up with values of exception_error_code,
* which would otherwise conflict as every C library loves to use values from -1 downwards.
*/
throw dpp::voice_exception((exception_error_code)(opus_error - 10), "discord_voice_client::discord_voice_client; opus_decoder_create() failed");
}
throw dpp::voice_exception((exception_error_code)(opus_error - 10), "discord_voice_client::discord_voice_client; opus_decoder_create() failed");
}
}

if (vp.seq < range.min_seq && vp.timestamp < range.min_timestamp) {
/* This packet arrived too late. We can only discard it. */
return;
}
range.max_seq = vp.seq;
range.max_timestamp = vp.timestamp;
payload_queue.push(std::move(vp));
if (vp.seq < range.min_seq && vp.timestamp < range.min_timestamp) {
/* This packet arrived too late. We can only discard it. */
return;
}
range.max_seq = vp.seq;
range.max_timestamp = vp.timestamp;
payload_queue.push(std::move(vp));
}

voice_courier_shared_state.signal_iteration.notify_one();
voice_courier_shared_state.signal_iteration.notify_one();

if (!voice_courier.joinable()) {
/* Courier thread is not running, start it */
voice_courier = std::thread(&voice_courier_loop,
std::ref(*this),
std::ref(voice_courier_shared_state));
}
if (!voice_courier.joinable()) {
/* Courier thread is not running, start it */
voice_courier = std::thread(&voice_courier_loop,
std::ref(*this),
std::ref(voice_courier_shared_state));
}
#else
throw dpp::voice_exception(err_no_voice_support, "Voice support not enabled in this build of D++");
Expand Down Expand Up @@ -1244,13 +1275,13 @@ discord_voice_client& discord_voice_client::send_audio_raw(uint16_t* audio_data,
return send_audio_raw((uint16_t*)packet.data(), packet.size());
}

opus_int32 encodedAudioMaxLength = (opus_int32)length;
std::vector<uint8_t> encodedAudioData(encodedAudioMaxLength);
size_t encodedAudioLength = encodedAudioMaxLength;
opus_int32 encoded_audio_max_length = (opus_int32)length;
std::vector<uint8_t> encoded_audio(encoded_audio_max_length);
size_t encoded_audio_length = encoded_audio_max_length;

encodedAudioLength = this->encode((uint8_t*)audio_data, length, encodedAudioData.data(), encodedAudioLength);
encoded_audio_length = this->encode((uint8_t*)audio_data, length, encoded_audio.data(), encoded_audio_length);

send_audio_opus(encodedAudioData.data(), encodedAudioLength);
send_audio_opus(encoded_audio.data(), encoded_audio_length);
#else
throw dpp::voice_exception(err_no_voice_support, "Voice support not enabled in this build of D++");
#endif
Expand All @@ -1270,31 +1301,54 @@ discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet

discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet, const size_t length, uint64_t duration) {
#if HAVE_VOICE
int frameSize = (int)(48 * duration * (timescale / 1000000));
opus_int32 encodedAudioMaxLength = (opus_int32)length;
std::vector<uint8_t> encodedAudioData(encodedAudioMaxLength);
size_t encodedAudioLength = encodedAudioMaxLength;
int frame_size = (int)(48 * duration * (timescale / 1000000));
opus_int32 encoded_audio_max_length = (opus_int32)length;
std::vector<uint8_t> encoded_audio(encoded_audio_max_length);
size_t encoded_audio_length = encoded_audio_max_length;

encodedAudioLength = length;
encodedAudioData.reserve(length);
memcpy(encodedAudioData.data(), opus_packet, length);
encoded_audio_length = length;
encoded_audio.reserve(length);
memcpy(encoded_audio.data(), opus_packet, length);

++sequence;
rtp_header header(sequence, timestamp, (uint32_t)ssrc);

std::vector<uint8_t> audioDataPacket(sizeof(header) + encodedAudioLength + crypto_secretbox_MACBYTES);
std::memcpy(audioDataPacket.data(), &header, sizeof(header));
/* Expected payload size is unencrypted header + encrypted opus packet + unencrypted 32 bit nonce */
size_t packet_siz = sizeof(header) + (encoded_audio_length + crypto_aead_xchacha20poly1305_IETF_ABYTES) + sizeof(packet_nonce);

std::vector<uint8_t> payload(packet_siz);

/* Set RTP header */
std::memcpy(payload.data(), &header, sizeof(header));

/* Convert nonce to big-endian */
uint32_t noncel = htonl(packet_nonce);

/* The cipher needs a 24-byte nonce, but Discord only wants 4 bytes, so the remaining bytes are left as zero */
unsigned char encrypt_nonce[crypto_aead_xchacha20poly1305_ietf_NPUBBYTES] = { '\0' };
memcpy(encrypt_nonce, &noncel, sizeof(noncel));

unsigned char nonce[crypto_aead_xchacha20poly1305_ietf_NPUBBYTES];
randombytes_buf(nonce, sizeof nonce);
/* Encrypt the opus packet into the payload, directly after the RTP header */
crypto_aead_xchacha20poly1305_ietf_encrypt(
payload.data() + sizeof(header),
nullptr,
encoded_audio.data(),
encoded_audio_length,
/* The RTP Header as Additional Data */
reinterpret_cast<const unsigned char *>(&header),
sizeof(header),
nullptr,
static_cast<const unsigned char*>(encrypt_nonce),
secret_key);

unsigned long long clen_p;
crypto_aead_xchacha20poly1305_ietf_encrypt(audioDataPacket.data() + sizeof(header), &clen_p, encodedAudioData.data(), encodedAudioLength, NULL, NULL, NULL, (const unsigned char*)nonce, secret_key);
/* Append the 4 byte nonce to the resulting payload */
std::memcpy(payload.data() + payload.size() - sizeof(noncel), &noncel, sizeof(noncel));

//crypto_secretbox_easy(audioDataPacket.data() + sizeof(header), encodedAudioData.data(), encodedAudioLength, (const unsigned char*)nonce, secret_key);
this->send(reinterpret_cast<const char*>(payload.data()), payload.size(), duration);
timestamp += frame_size;

this->send((const char*)audioDataPacket.data(), audioDataPacket.size(), duration);
timestamp += frameSize;
/* Increment for next packet */
packet_nonce++;

speak();
#else
Expand Down

0 comments on commit a797675

Please sign in to comment.