Skip to content

Commit 2c23c6b

Browse files
authored
Merge pull request #3 from deepgram/lo/fix-to-respect-dg-speakers
feat: support speaker labels
2 parents 1b34161 + eec80b3 commit 2c23c6b

6 files changed: 6952 additions (+6952) and 6 deletions (-6).

src/converters/DeepgramConverter.ts

+25 -1
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,31 @@ export class DeepgramConverter implements IConverter {
1818
}
1919
});
2020
} else {
21-
content.push(...chunkArray(results.channels[0].alternatives[0].words, lineLength));
21+
const words = results.channels[0].alternatives[0].words;
22+
const diarize = "speaker" in words[0]; // was diarization used
23+
24+
let buffer: WordBase[] = [];
25+
let currentSpeaker = 0;
26+
27+
words.forEach((word) => {
28+
if (diarize && word.speaker !== currentSpeaker) {
29+
content.push(buffer);
30+
buffer = [];
31+
}
32+
33+
if (buffer.length === lineLength) {
34+
content.push(buffer);
35+
buffer = [];
36+
}
37+
38+
if (diarize) {
39+
currentSpeaker = word.speaker ?? 0;
40+
}
41+
42+
buffer.push(word);
43+
});
44+
45+
content.push(buffer);
2246
}
2347

2448
return content;

src/index.ts

+27 -4
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import { DeepgramConverter, IConverter, isConverter } from "./converters";
2-
import { secondsToTimestamp } from "./lib/helpers";
2+
import { chunkArray, secondsToTimestamp } from "./lib/helpers";
33
import { DeepgramResponse } from "./lib/types";
44

55
const parseInput = (transcription: any): IConverter => {
@@ -26,12 +26,19 @@ const webvtt = (transcription: any, lineLength: number = 8): string => {
2626
// get the lines
2727
const lines = data.getLines(lineLength);
2828

29+
// is speaker output required?
30+
const speakerLabels = "speaker" in lines[0][0];
31+
2932
lines.forEach((words) => {
3033
const firstWord = words[0];
3134
const lastWord = words[words.length - 1];
3235

3336
output.push(`${secondsToTimestamp(firstWord.start)} --> ${secondsToTimestamp(lastWord.end)}`);
34-
output.push(words.map((word) => word.punctuated_word ?? word.word).join(" "));
37+
38+
const line = words.map((word) => word.punctuated_word ?? word.word).join(" ");
39+
const speakerLabel = speakerLabels ? `<v Speaker ${firstWord.speaker}>` : "";
40+
41+
output.push(`${speakerLabel}${line}`);
3542
output.push("");
3643
});
3744

@@ -41,9 +48,16 @@ const webvtt = (transcription: any, lineLength: number = 8): string => {
4148
const srt = (transcription: any, lineLength: number = 8): string => {
4249
const output: string[] = [];
4350

51+
const data = parseInput(transcription);
52+
4453
// get the lines
45-
const lines = parseInput(transcription).getLines(lineLength);
54+
let lines = data.getLines(lineLength);
55+
56+
// is speaker output required?
57+
const speakerLabels = "speaker" in lines[0][0];
58+
4659
let entry = 1;
60+
let currentSpeaker: any;
4761

4862
lines.forEach((words) => {
4963
output.push((entry++).toString());
@@ -57,8 +71,17 @@ const srt = (transcription: any, lineLength: number = 8): string => {
5771
"HH:mm:ss,SSS"
5872
)}`
5973
);
60-
output.push(words.map((word) => word.punctuated_word ?? word.word).join(" "));
74+
75+
const line = words.map((word) => word.punctuated_word ?? word.word).join(" ");
76+
const speakerLabel =
77+
speakerLabels && currentSpeaker !== firstWord.speaker
78+
? `[Speaker ${firstWord.speaker}]\n`
79+
: "";
80+
81+
output.push(`${speakerLabel}${line}`);
6182
output.push("");
83+
84+
currentSpeaker = firstWord.speaker;
6285
});
6386

6487
return output.join("\n");

src/lib/types.ts

+1 -1
Original file line number | Diff line number | Diff line change
@@ -130,7 +130,7 @@ export interface WordBase {
130130
word: string;
131131
start: number;
132132
end: number;
133-
confidence: number;
133+
confidence?: number;
134134
punctuated_word?: string;
135135
speaker?: number;
136136
speaker_confidence?: number;

test/deepgram.test.ts

+23
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import dg_transcription from "./dg-transcription.json";
22
import dg_utterances from "./dg-transcription.json";
3+
import dg_speakers from "./dg-speakers-no-utterance.json";
34
import { webvtt, srt, DeepgramConverter } from "../src/index";
45
import { expect } from "chai";
56
import srtValidator from "srt-validator";
@@ -76,4 +77,26 @@ describe("testing deepgram transcription formatting", () => {
7677

7778
expect(srtValidator(result)).to.deep.equal([]);
7879
});
80+
81+
it("should return a valid webvtt format with speaker labels when provided a deepgram transcription", () => {
82+
const result = webvtt(new DeepgramConverter(dg_speakers));
83+
84+
expect(typeof result).to.equal("string");
85+
expect(result).to.have.string("NOTE");
86+
expect(result).to.have.string("Transcription provided by Deepgram");
87+
expect(result).to.have.string("Request Id");
88+
expect(result).to.have.string("Created");
89+
expect(result).to.have.string("Duration");
90+
expect(result).to.have.string("Channels");
91+
expect(result).to.have.string("Channels");
92+
expect(result).to.have.string("<v Speaker 0>");
93+
expect(result).to.have.string("<v Speaker 1>");
94+
expect(result).to.have.string("<v Speaker 2>");
95+
});
96+
97+
it("should return a valid srt format with speaker labels when provided a deepgram transcription", () => {
98+
const result = srt(new DeepgramConverter(dg_speakers));
99+
100+
expect(srtValidator(result)).to.deep.equal([]);
101+
});
79102
});

0 commit comments

Comments (0)