Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: XChaCha20 voice encryption #1242

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions include/dpp/discordvoiceclient.h
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,17 @@ class DPP_EXPORT discord_voice_client : public websocket_client
*/
uint32_t timestamp;

/**
* @brief Each packet should have a nonce, a 32-bit incremental
* integer value appended to payload.
*
* We should keep track of this value and increment it for each
* packet sent.
*
* Current initial value is hardcoded to 1.
*/
uint32_t packet_nonce;

/**
* @brief Last sent packet high-resolution timestamp
*/
Expand Down
310 changes: 182 additions & 128 deletions src/dpp/discordvoiceclient.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
*
************************************************************************************/

#include <cstdint>
#include <dpp/export.h>
#ifdef _WIN32
#include <WinSock2.h>
Expand Down Expand Up @@ -316,6 +317,7 @@ discord_voice_client::discord_voice_client(dpp::cluster* _cluster, snowflake _ch
secret_key(nullptr),
sequence(0),
timestamp(0),
packet_nonce(1),
last_timestamp(std::chrono::high_resolution_clock::now()),
sending(false),
tracks(0),
Expand Down Expand Up @@ -593,6 +595,9 @@ bool discord_voice_client::handle_frame(const std::string &data)
rdy.voice_channel_id = this->channel_id;
creator->on_voice_ready.call(rdy);
}

/* Reset packet_nonce */
packet_nonce = 1;
}
break;
/* Voice ready */
Expand Down Expand Up @@ -711,131 +716,157 @@ void discord_voice_client::read_ready()
uint8_t buffer[65535];
int packet_size = this->udp_recv((char*)buffer, sizeof(buffer));

if (packet_size > 0 && (!creator->on_voice_receive.empty() || !creator->on_voice_receive_combined.empty())) {
constexpr size_t header_size = 12;
if (static_cast<size_t>(packet_size) < header_size) {
/* Invalid RTP payload */
return;
}
bool receive_handler_is_empty = creator->on_voice_receive.empty() && creator->on_voice_receive_combined.empty();
if (packet_size <= 0 || receive_handler_is_empty) {
/* Nothing to do */
return;
}

/* It's a "silence packet" - throw it away. */
if (packet_size < 44) {
return;
}
constexpr size_t header_size = 12;
if (static_cast<size_t>(packet_size) < header_size) {
/* Invalid RTP payload */
return;
}

if (uint8_t payload_type = buffer[1] & 0b0111'1111;
72 <= payload_type && payload_type <= 76) {
/*
* This is an RTCP payload. Discord is known to send
* RTCP Receiver Reports.
*
* See https://datatracker.ietf.org/doc/html/rfc3551#section-6
*/
return;
}
/* It's a "silence packet" - throw it away. */
if (packet_size < 44) {
return;
}

voice_payload vp{0, // seq, populate later
0, // timestamp, populate later
std::make_unique<voice_receive_t>(nullptr, std::string((char*)buffer, packet_size))};
if (uint8_t payload_type = buffer[1] & 0b0111'1111;
72 <= payload_type && payload_type <= 76) {
/*
* This is an RTCP payload. Discord is known to send
* RTCP Receiver Reports.
*
* See https://datatracker.ietf.org/doc/html/rfc3551#section-6
*/
return;
}

vp.vr->voice_client = this;
voice_payload vp{0, // seq, populate later
0, // timestamp, populate later
std::make_unique<voice_receive_t>(nullptr, std::string((char*)buffer, packet_size))};

{ /* Get the User ID of the speaker */
uint32_t speaker_ssrc;
std::memcpy(&speaker_ssrc, &buffer[8], sizeof(uint32_t));
speaker_ssrc = ntohl(speaker_ssrc);
vp.vr->user_id = ssrc_map[speaker_ssrc];
}
vp.vr->voice_client = this;

/* Get the sequence number of the voice UDP packet */
std::memcpy(&vp.seq, &buffer[2], sizeof(rtp_seq_t));
vp.seq = ntohs(vp.seq);
/* Get the timestamp of the voice UDP packet */
std::memcpy(&vp.timestamp, &buffer[4], sizeof(rtp_timestamp_t));
vp.timestamp = ntohl(vp.timestamp);

/* Nonce is the RTP Header with zero padding */
uint8_t nonce[24] = { 0 };
std::memcpy(nonce, &buffer[0], header_size);

/* Get the number of CSRC in header */
const size_t csrc_count = buffer[0] & 0b0000'1111;
/* Skip to the encrypted voice data */
const ptrdiff_t offset_to_data = header_size + sizeof(uint32_t) * csrc_count;
uint8_t* encrypted_data = buffer + offset_to_data;
const size_t encrypted_data_len = packet_size - offset_to_data;

if(crypto_aead_xchacha20poly1305_ietf_decrypt() != 0)

if (crypto_secretbox_open_easy(encrypted_data, encrypted_data,
encrypted_data_len, nonce, secret_key)) {
/* Invalid Discord RTP payload. */
return;
}
uint32_t speaker_ssrc;
{ /* Get the User ID of the speaker */
std::memcpy(&speaker_ssrc, &buffer[8], sizeof(uint32_t));
speaker_ssrc = ntohl(speaker_ssrc);
vp.vr->user_id = ssrc_map[speaker_ssrc];
}

const uint8_t* decrypted_data = encrypted_data;
size_t decrypted_data_len = encrypted_data_len - crypto_box_MACBYTES;
if ([[maybe_unused]] const bool uses_extension = (buffer[0] >> 4) & 0b0001) {
/* Skip the RTP Extensions */
size_t ext_len = 0;
{
uint16_t ext_len_in_words;
memcpy(&ext_len_in_words, &decrypted_data[2], sizeof(uint16_t));
ext_len_in_words = ntohs(ext_len_in_words);
ext_len = sizeof(uint32_t) * ext_len_in_words;
}
constexpr size_t ext_header_len = sizeof(uint16_t) * 2;
decrypted_data += ext_header_len + ext_len;
decrypted_data_len -= ext_header_len + ext_len;
/* Get the sequence number of the voice UDP packet */
std::memcpy(&vp.seq, &buffer[2], sizeof(rtp_seq_t));
vp.seq = ntohs(vp.seq);
/* Get the timestamp of the voice UDP packet */
std::memcpy(&vp.timestamp, &buffer[4], sizeof(rtp_timestamp_t));
vp.timestamp = ntohl(vp.timestamp);

constexpr size_t nonce_size = sizeof(uint32_t);
/* The nonce is the last 4 bytes of the payload, zero-padded to 24 bytes */
uint8_t nonce[24] = { 0 };
std::memcpy(nonce, buffer + packet_size - nonce_size, nonce_size);

/* Get the number of CSRC in header */
const size_t csrc_count = buffer[0] & 0b0000'1111;
/* Skip to the encrypted voice data */
const ptrdiff_t offset_to_data = header_size + sizeof(uint32_t) * csrc_count;
size_t total_header_len = offset_to_data;

uint8_t* ciphertext = buffer + offset_to_data;
size_t ciphertext_len = packet_size - offset_to_data - nonce_size;

size_t ext_len = 0;
if ([[maybe_unused]] const bool uses_extension = (buffer[0] >> 4) & 0b0001) {
/**
* Get the size of the RTP extension. Only the size is read here because
* the extension itself is encrypted along with the opus packet.
*/
{
uint16_t ext_len_in_words;
memcpy(&ext_len_in_words, &ciphertext[2], sizeof(uint16_t));
ext_len_in_words = ntohs(ext_len_in_words);
ext_len = sizeof(uint32_t) * ext_len_in_words;
}
constexpr size_t ext_header_len = sizeof(uint16_t) * 2;
ciphertext += ext_header_len;
ciphertext_len -= ext_header_len;
total_header_len += ext_header_len;
}

/*
* We're left with the decrypted, opus-encoded data.
* Park the payload and decode on the voice courier thread.
uint8_t decrypted[65535] = { 0 };
unsigned long long opus_packet_len = 0;
if (crypto_aead_xchacha20poly1305_ietf_decrypt(
decrypted, &opus_packet_len,
nullptr,
ciphertext, ciphertext_len,
buffer,
/**
* Additional Data:
* The whole header (including csrc list) +
* 4 byte extension header (magic 0xBEDE + 16-bit denoting extension length)
*/
vp.vr->audio_data.assign(decrypted_data, decrypted_data + decrypted_data_len);
total_header_len,
nonce, secret_key) != 0) {
/* Invalid Discord RTP payload. */
return;
}

{
std::lock_guard lk(voice_courier_shared_state.mtx);
auto& [range, payload_queue, pending_decoder_ctls, decoder] = voice_courier_shared_state.parked_voice_payloads[vp.vr->user_id];
uint8_t *opus_packet = decrypted;
if (ext_len > 0) {
/* Skip previously encrypted RTP Header Extension */
opus_packet += ext_len;
opus_packet_len -= ext_len;
}

if (!decoder) {
/*
* Most likely this is the first time we encounter this speaker.
* Do some initialization for not only the decoder but also the range.
/*
* We're left with the decrypted, opus-encoded data.
* Park the payload and decode on the voice courier thread.
*/
vp.vr->audio_data.assign(opus_packet, opus_packet + opus_packet_len);

{
std::lock_guard lk(voice_courier_shared_state.mtx);
auto& [range, payload_queue, pending_decoder_ctls, decoder] = voice_courier_shared_state.parked_voice_payloads[vp.vr->user_id];

if (!decoder) {
/*
* Most likely this is the first time we encounter this speaker.
* Do some initialization for not only the decoder but also the range.
*/
range.min_seq = vp.seq;
range.min_timestamp = vp.timestamp;

int opus_error = 0;
decoder.reset(opus_decoder_create(opus_sample_rate_hz, opus_channel_count, &opus_error),
&opus_decoder_destroy);
if (opus_error) {
/**
* NOTE: The -10 here makes the opus_error match up with values of exception_error_code,
* which would otherwise conflict as every C library loves to use values from -1 downwards.
*/
range.min_seq = vp.seq;
range.min_timestamp = vp.timestamp;

int opus_error = 0;
decoder.reset(opus_decoder_create(opus_sample_rate_hz, opus_channel_count, &opus_error),
&opus_decoder_destroy);
if (opus_error) {
/**
* NOTE: The -10 here makes the opus_error match up with values of exception_error_code,
* which would otherwise conflict as every C library loves to use values from -1 downwards.
*/
throw dpp::voice_exception((exception_error_code)(opus_error - 10), "discord_voice_client::discord_voice_client; opus_decoder_create() failed");
}
throw dpp::voice_exception((exception_error_code)(opus_error - 10), "discord_voice_client::discord_voice_client; opus_decoder_create() failed");
}
}

if (vp.seq < range.min_seq && vp.timestamp < range.min_timestamp) {
/* This packet arrived too late. We can only discard it. */
return;
}
range.max_seq = vp.seq;
range.max_timestamp = vp.timestamp;
payload_queue.push(std::move(vp));
if (vp.seq < range.min_seq && vp.timestamp < range.min_timestamp) {
/* This packet arrived too late. We can only discard it. */
return;
}
range.max_seq = vp.seq;
range.max_timestamp = vp.timestamp;
payload_queue.push(std::move(vp));
}

voice_courier_shared_state.signal_iteration.notify_one();
voice_courier_shared_state.signal_iteration.notify_one();

if (!voice_courier.joinable()) {
/* Courier thread is not running, start it */
voice_courier = std::thread(&voice_courier_loop,
std::ref(*this),
std::ref(voice_courier_shared_state));
}
if (!voice_courier.joinable()) {
/* Courier thread is not running, start it */
voice_courier = std::thread(&voice_courier_loop,
std::ref(*this),
std::ref(voice_courier_shared_state));
}
#else
throw dpp::voice_exception(err_no_voice_support, "Voice support not enabled in this build of D++");
Expand Down Expand Up @@ -1244,13 +1275,13 @@ discord_voice_client& discord_voice_client::send_audio_raw(uint16_t* audio_data,
return send_audio_raw((uint16_t*)packet.data(), packet.size());
}

opus_int32 encodedAudioMaxLength = (opus_int32)length;
std::vector<uint8_t> encodedAudioData(encodedAudioMaxLength);
size_t encodedAudioLength = encodedAudioMaxLength;
opus_int32 encoded_audio_max_length = (opus_int32)length;
std::vector<uint8_t> encoded_audio(encoded_audio_max_length);
size_t encoded_audio_length = encoded_audio_max_length;

encodedAudioLength = this->encode((uint8_t*)audio_data, length, encodedAudioData.data(), encodedAudioLength);
encoded_audio_length = this->encode((uint8_t*)audio_data, length, encoded_audio.data(), encoded_audio_length);

send_audio_opus(encodedAudioData.data(), encodedAudioLength);
send_audio_opus(encoded_audio.data(), encoded_audio_length);
#else
throw dpp::voice_exception(err_no_voice_support, "Voice support not enabled in this build of D++");
#endif
Expand All @@ -1270,31 +1301,54 @@ discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet

discord_voice_client& discord_voice_client::send_audio_opus(uint8_t* opus_packet, const size_t length, uint64_t duration) {
#if HAVE_VOICE
int frameSize = (int)(48 * duration * (timescale / 1000000));
opus_int32 encodedAudioMaxLength = (opus_int32)length;
std::vector<uint8_t> encodedAudioData(encodedAudioMaxLength);
size_t encodedAudioLength = encodedAudioMaxLength;
int frame_size = (int)(48 * duration * (timescale / 1000000));
opus_int32 encoded_audio_max_length = (opus_int32)length;
std::vector<uint8_t> encoded_audio(encoded_audio_max_length);
size_t encoded_audio_length = encoded_audio_max_length;

encodedAudioLength = length;
encodedAudioData.reserve(length);
memcpy(encodedAudioData.data(), opus_packet, length);
encoded_audio_length = length;
encoded_audio.reserve(length);
memcpy(encoded_audio.data(), opus_packet, length);

++sequence;
rtp_header header(sequence, timestamp, (uint32_t)ssrc);

std::vector<uint8_t> audioDataPacket(sizeof(header) + encodedAudioLength + crypto_secretbox_MACBYTES);
std::memcpy(audioDataPacket.data(), &header, sizeof(header));
/* Expected payload size is unencrypted header + encrypted opus packet + unencrypted 32 bit nonce */
size_t packet_siz = sizeof(header) + (encoded_audio_length + crypto_aead_xchacha20poly1305_IETF_ABYTES) + sizeof(packet_nonce);

std::vector<uint8_t> payload(packet_siz);

/* Set RTP header */
std::memcpy(payload.data(), &header, sizeof(header));

/* Convert nonce to big-endian */
uint32_t noncel = htonl(packet_nonce);

/* Encryption needs a 24-byte nonce, but Discord only wants 4 bytes, so the remaining bytes are left as null */
unsigned char encrypt_nonce[crypto_aead_xchacha20poly1305_ietf_NPUBBYTES] = { '\0' };
memcpy(encrypt_nonce, &noncel, sizeof(noncel));

unsigned char nonce[crypto_aead_xchacha20poly1305_ietf_NPUBBYTES];
randombytes_buf(nonce, sizeof nonce);
/* Execute */
crypto_aead_xchacha20poly1305_ietf_encrypt(
payload.data() + sizeof(header),
nullptr,
encoded_audio.data(),
encoded_audio_length,
/* The RTP Header as Additional Data */
reinterpret_cast<const unsigned char *>(&header),
sizeof(header),
nullptr,
static_cast<const unsigned char*>(encrypt_nonce),
secret_key);

unsigned long long clen_p;
crypto_aead_xchacha20poly1305_ietf_encrypt(audioDataPacket.data() + sizeof(header), &clen_p, encodedAudioData.data(), encodedAudioLength, NULL, NULL, NULL, (const unsigned char*)nonce, secret_key);
/* Append the 4 byte nonce to the resulting payload */
std::memcpy(payload.data() + payload.size() - sizeof(noncel), &noncel, sizeof(noncel));

//crypto_secretbox_easy(audioDataPacket.data() + sizeof(header), encodedAudioData.data(), encodedAudioLength, (const unsigned char*)nonce, secret_key);
this->send(reinterpret_cast<const char*>(payload.data()), payload.size(), duration);
timestamp += frame_size;

this->send((const char*)audioDataPacket.data(), audioDataPacket.size(), duration);
timestamp += frameSize;
/* Increment for next packet */
packet_nonce++;

speak();
#else
Expand Down
Loading