Merge pull request #3 from deepgram/lo/fix-to-respect-dg-speakers

lukeocodes · web-flow · commit 2c23c6bbb04d · 2023-11-07T08:29:38.000Z
feat: support speaker labels
diff --git a/src/converters/DeepgramConverter.ts b/src/converters/DeepgramConverter.ts
@@ -18,7 +18,31 @@ export class DeepgramConverter implements IConverter {
         }
       });
     } else {
-      content.push(...chunkArray(results.channels[0].alternatives[0].words, lineLength));
+      // No utterances: build lines from the flat word list, breaking on
+      // speaker changes (when diarized) and every `lineLength` words.
+      const words = results.channels[0].alternatives[0].words;
+      // Diarized responses carry a `speaker` index on each word; an empty
+      // word list has no words[0] to inspect, so guard before using `in`.
+      const diarize = words.length > 0 && "speaker" in words[0];
+
+      let buffer: WordBase[] = [];
+      let currentSpeaker = 0;
+
+      words.forEach((word) => {
+        const speakerChanged = diarize && word.speaker !== currentSpeaker;
+        const lineFull = buffer.length === lineLength;
+        // Never flush an empty buffer: the first word's speaker may not be 0,
+        // and an empty line would leave callers without a first/last word.
+        if (buffer.length > 0 && (speakerChanged || lineFull)) {
+          content.push(buffer);
+          buffer = [];
+        }
+
+        if (diarize) currentSpeaker = word.speaker ?? 0;
+        buffer.push(word);
+      });
+
+      if (buffer.length > 0) content.push(buffer);
     }
 
     return content;
diff --git a/src/index.ts b/src/index.ts
@@ -1,5 +1,5 @@
 import { DeepgramConverter, IConverter, isConverter } from "./converters";
-import { secondsToTimestamp } from "./lib/helpers";
+import { chunkArray, secondsToTimestamp } from "./lib/helpers";
 import { DeepgramResponse } from "./lib/types";
 
 const parseInput = (transcription: any): IConverter => {
@@ -26,12 +26,19 @@ const webvtt = (transcription: any, lineLength: number = 8): string => {
   // get the lines
   const lines = data.getLines(lineLength);
 
+  // is speaker output required? (an empty transcript has no first word to inspect)
+  const speakerLabels = lines[0]?.[0] !== undefined && "speaker" in lines[0][0];
+
   lines.forEach((words) => {
     const firstWord = words[0];
     const lastWord = words[words.length - 1];
 
     output.push(`${secondsToTimestamp(firstWord.start)} --> ${secondsToTimestamp(lastWord.end)}`);
-    output.push(words.map((word) => word.punctuated_word ?? word.word).join(" "));
+
+    const line = words.map((word) => word.punctuated_word ?? word.word).join(" ");
+    const speakerLabel = speakerLabels ? `<v Speaker ${firstWord.speaker}>` : "";
+
+    output.push(`${speakerLabel}${line}`);
     output.push("");
   });
 
@@ -41,9 +48,16 @@ const webvtt = (transcription: any, lineLength: number = 8): string => {
 const srt = (transcription: any, lineLength: number = 8): string => {
   const output: string[] = [];
 
+  const data = parseInput(transcription);
+
   // get the lines
-  const lines = parseInput(transcription).getLines(lineLength);
+  let lines = data.getLines(lineLength);
+
+  // is speaker output required? (an empty transcript has no first word to inspect)
+  const speakerLabels = lines[0]?.[0] !== undefined && "speaker" in lines[0][0];
+
   let entry = 1;
+  let currentSpeaker: any;
 
   lines.forEach((words) => {
     output.push((entry++).toString());
@@ -57,8 +71,17 @@ const srt = (transcription: any, lineLength: number = 8): string => {
         "HH:mm:ss,SSS"
       )}`
     );
-    output.push(words.map((word) => word.punctuated_word ?? word.word).join(" "));
+
+    const line = words.map((word) => word.punctuated_word ?? word.word).join(" ");
+    const speakerLabel =
+      speakerLabels && currentSpeaker !== firstWord.speaker
+        ? `[Speaker ${firstWord.speaker}]\n`
+        : "";
+
+    output.push(`${speakerLabel}${line}`);
     output.push("");
+
+    currentSpeaker = firstWord.speaker;
   });
 
   return output.join("\n");
diff --git a/src/lib/types.ts b/src/lib/types.ts
@@ -130,7 +130,7 @@ export interface WordBase {
   word: string;
   start: number;
   end: number;
-  confidence: number;
+  confidence?: number;
   punctuated_word?: string;
   speaker?: number;
   speaker_confidence?: number;
diff --git a/test/deepgram.test.ts b/test/deepgram.test.ts
@@ -1,5 +1,6 @@
 import dg_transcription from "./dg-transcription.json";
 import dg_utterances from "./dg-transcription.json";
+import dg_speakers from "./dg-speakers-no-utterance.json";
 import { webvtt, srt, DeepgramConverter } from "../src/index";
 import { expect } from "chai";
 import srtValidator from "srt-validator";
@@ -76,4 +77,26 @@ describe("testing deepgram transcription formatting", () => {
 
     expect(srtValidator(result)).to.deep.equal([]);
   });
+
+  it("should return a valid webvtt format with speaker labels when provided a deepgram transcription", () => {
+    const result = webvtt(new DeepgramConverter(dg_speakers));
+
+    expect(typeof result).to.equal("string");
+    expect(result).to.have.string("NOTE");
+    expect(result).to.have.string("Transcription provided by Deepgram");
+    expect(result).to.have.string("Request Id");
+    expect(result).to.have.string("Created");
+    expect(result).to.have.string("Duration");
+    expect(result).to.have.string("Channels");
+    expect(result).to.have.string("Channels");
+    expect(result).to.have.string("<v Speaker 0>");
+    expect(result).to.have.string("<v Speaker 1>");
+    expect(result).to.have.string("<v Speaker 2>");
+  });
+
+  it("should return a valid srt format with speaker labels when provided a deepgram transcription", () => {
+    const result = srt(new DeepgramConverter(dg_speakers));
+
+    expect(srtValidator(result)).to.deep.equal([]);
+  });
 });
diff --git a/test/dg-speakers-no-utterance.json b/test/dg-speakers-no-utterance.json
diff --git a/test/dg-speakers.json b/test/dg-speakers.json