Introduction

Vapi supports several transcription providers, but sometimes you may need to use your own transcription service. This guide shows you how to integrate Deepgram as your custom transcriber. The solution streams raw stereo PCM audio (16‑bit) from Vapi via WebSocket to your server, which then forwards the audio to Deepgram. Deepgram returns real‑time partial and final transcripts that are processed (including channel detection) and sent back to Vapi.

Why Use a Custom Transcriber?

  • Flexibility: Integrate with your preferred transcription service.
  • Control: Implement specialized processing that isn’t available with built‑in providers.
  • Cost Efficiency: Leverage your existing transcription infrastructure while maintaining full control over the pipeline.
  • Customization: Tailor the handling of audio data, transcript formatting, and buffering according to your specific needs.

How It Works

  1. Connection Initialization:
    Vapi connects to your custom transcriber endpoint (e.g. /api/custom-transcriber) via WebSocket. It sends an initial JSON message like this:

    {
      "type": "start",
      "encoding": "linear16",
      "container": "raw",
      "sampleRate": 16000,
      "channels": 2
    }
  2. Audio Streaming:
    Vapi then streams binary PCM audio to your server.

  3. Transcription Processing:
    Your server forwards the audio to Deepgram (the transcriber chosen for this example) using its SDK. Deepgram processes the audio and returns transcript events that include a channel_index (e.g. [0, ...] for the customer, [1, ...] for the assistant); an abbreviated event is shown after this list. The service buffers the incoming data, processes the transcript events (with debouncing and channel detection), and emits a final transcript.

  4. Response:
    The final transcript is sent back to Vapi as a JSON message:

    {
      "type": "transcriber-response",
      "transcription": "The transcribed text",
      "channel": "customer" // or "assistant"
    }
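
For reference, a slightly abbreviated multichannel transcript event from Deepgram's live API looks like this (fields trimmed for brevity; the first element of channel_index identifies the channel that was speaking, the second is the total channel count):

{
  "type": "Results",
  "channel_index": [0, 2],
  "is_final": true,
  "speech_final": true,
  "channel": {
    "alternatives": [
      { "transcript": "Hello, how can I help you?", "confidence": 0.99 }
    ]
  }
}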

Implementation Steps

1. Project Setup

Create a new Node.js project and install the required dependencies:

$ mkdir vapi-custom-transcriber
$ cd vapi-custom-transcriber
$ npm init -y
$ npm install ws express dotenv @deepgram/sdk

Create a .env file with the following content:

DEEPGRAM_API_KEY=your_deepgram_api_key
PORT=3001

2. Code Files

Below are the individual code files you need for the integration.
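
fileLogger.js

Both modules below depend on a small fileLogger helper that is not part of any SDK and is not shown in the original guide. The following is a minimal sketch (an assumed implementation; any logger exposing logDetailed, warn, and error with these signatures will work):

// fileLogger.js - minimal file/console logger (assumed implementation).
const fs = require("fs");

class FileLogger {
  constructor(filename = "transcriber.log") {
    // Append structured log entries to a file and mirror them to the console.
    this.stream = fs.createWriteStream(filename, { flags: "a" });
  }

  write(level, message, component, data) {
    const entry = {
      timestamp: new Date().toISOString(),
      level,
      component,
      message,
      ...(data !== undefined && { data }),
    };
    const line = JSON.stringify(entry);
    this.stream.write(line + "\n");
    console.log(line);
  }

  logDetailed(level, message, component, data) {
    this.write(level, message, component, data);
  }

  warn(message, component = "App") {
    this.write("WARN", message, component);
  }

  error(message, err, component = "App") {
    this.write("ERROR", message, component, {
      error: err && err.message ? err.message : String(err),
    });
  }
}

module.exports = FileLogger;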

transcriptionService.js

This service creates a live connection to Deepgram, processes incoming audio, handles transcript events (including channel detection), and emits the final transcript back to the caller.

const { createClient, LiveTranscriptionEvents } = require("@deepgram/sdk");
const EventEmitter = require("events");

const MAX_RETRY_ATTEMPTS = 3;
const DEBOUNCE_DELAY_IN_SECS = 3;
const DEBOUNCE_DELAY = DEBOUNCE_DELAY_IN_SECS * 1000;
const DEEPGRAM_API_KEY = process.env["DEEPGRAM_API_KEY"] || "";

class TranscriptionService extends EventEmitter {
  constructor(config, logger) {
    super();
    this.config = config;
    this.logger = logger;
    if (!DEEPGRAM_API_KEY) {
      throw new Error("Missing Deepgram API Key");
    }
    this.deepgramClient = createClient(DEEPGRAM_API_KEY);
    this.logger.logDetailed(
      "INFO",
      "Initializing Deepgram live connection",
      "TranscriptionService",
      { model: "nova-2", sample_rate: 16000, channels: 2 }
    );
    this.deepgramLive = this.deepgramClient.listen.live({
      encoding: "linear16",
      channels: 2,
      sample_rate: 16000,
      model: "nova-2",
      smart_format: true,
      interim_results: true,
      endpointing: 800,
      language: "en",
      multichannel: true,
    });
    this.finalResult = { customer: "", assistant: "" };
    this.retryAttempts = 0;
    this.lastTranscriptionTime = Date.now();
    this.pcmBuffer = Buffer.alloc(0);

    this.deepgramLive.addListener(LiveTranscriptionEvents.Open, () => {
      this.logger.logDetailed("INFO", "Deepgram connection opened", "TranscriptionService");
      this.deepgramLive.on(LiveTranscriptionEvents.Close, () => {
        this.logger.logDetailed("INFO", "Deepgram connection closed", "TranscriptionService");
        // Flush anything still accumulated when the connection closes.
        this.emitTranscription();
      });
      this.deepgramLive.on(LiveTranscriptionEvents.Metadata, (data) => {
        this.logger.logDetailed("DEBUG", "Deepgram metadata received", "TranscriptionService", data);
      });
      this.deepgramLive.on(LiveTranscriptionEvents.Transcript, (event) => {
        this.handleTranscript(event);
      });
      this.deepgramLive.on(LiveTranscriptionEvents.Error, (err) => {
        this.logger.logDetailed("ERROR", "Deepgram error received", "TranscriptionService", { error: err });
        this.emit("transcriptionerror", err);
      });
    });
  }

  send(payload) {
    if (Buffer.isBuffer(payload)) {
      this.pcmBuffer =
        this.pcmBuffer.length === 0
          ? payload
          : Buffer.concat([this.pcmBuffer, payload]);
    } else {
      this.logger.warn("TranscriptionService: Received non-Buffer data chunk.");
    }
    if (this.deepgramLive.getReadyState() === 1 && this.pcmBuffer.length > 0) {
      // A stereo 16-bit PCM frame is 4 bytes. Forward only whole frames and
      // keep any trailing partial frame buffered for the next chunk.
      const sendableLength = this.pcmBuffer.length - (this.pcmBuffer.length % 4);
      if (sendableLength > 0) {
        this.sendBufferedData(this.pcmBuffer.subarray(0, sendableLength));
        this.pcmBuffer = this.pcmBuffer.subarray(sendableLength);
      }
    }
  }

  sendBufferedData(bufferedData) {
    try {
      this.logger.logDetailed("INFO", "Sending buffered data to Deepgram", "TranscriptionService", {
        bytes: bufferedData.length,
      });
      this.deepgramLive.send(bufferedData);
      this.retryAttempts = 0;
    } catch (error) {
      this.logger.logDetailed("ERROR", "Error sending buffered data", "TranscriptionService", { error });
      this.retryAttempts++;
      if (this.retryAttempts <= MAX_RETRY_ATTEMPTS) {
        setTimeout(() => {
          this.sendBufferedData(bufferedData);
        }, 1000);
      } else {
        this.logger.logDetailed("ERROR", "Max retry attempts reached, discarding data", "TranscriptionService");
        this.retryAttempts = 0;
      }
    }
  }

  handleTranscript(transcription) {
    if (!transcription.channel || !transcription.channel.alternatives?.[0]) {
      this.logger.logDetailed("WARN", "Invalid transcript format", "TranscriptionService", { transcription });
      return;
    }
    const text = transcription.channel.alternatives[0].transcript.trim();
    if (!text) return;
    const currentTime = Date.now();
    // channel_index[0] identifies the speaking channel: 0 = customer, 1 = assistant.
    const channelIndex = transcription.channel_index
      ? transcription.channel_index[0]
      : 0;
    const channel = channelIndex === 0 ? "customer" : "assistant";
    this.logger.logDetailed("INFO", "Received transcript", "TranscriptionService", { channel, text });
    if (transcription.is_final || transcription.speech_final) {
      // Accumulate only final results; interim results are cumulative
      // revisions of the same utterance and would duplicate text if appended.
      this.finalResult[channel] += ` ${text}`;
      this.emitTranscription();
    } else if (currentTime - this.lastTranscriptionTime >= DEBOUNCE_DELAY) {
      this.logger.logDetailed(
        "INFO",
        `Emitting transcript after ${DEBOUNCE_DELAY_IN_SECS}s inactivity`,
        "TranscriptionService"
      );
      this.emitTranscription();
    }
    this.lastTranscriptionTime = currentTime;
  }

  emitTranscription() {
    for (const chan of ["customer", "assistant"]) {
      if (this.finalResult[chan].trim()) {
        const transcript = this.finalResult[chan].trim();
        this.logger.logDetailed("INFO", "Emitting transcription", "TranscriptionService", {
          channel: chan,
          transcript,
        });
        this.emit("transcription", transcript, chan);
        this.finalResult[chan] = "";
      }
    }
  }
}

module.exports = TranscriptionService;

server.js

This file creates an Express server, attaches the custom transcriber WebSocket at /api/custom-transcriber, and starts the HTTP server.

require("dotenv").config(); // Load env vars before any module reads process.env
const express = require("express");
const http = require("http");
const { Server: WebSocketServer } = require("ws");
const TranscriptionService = require("./transcriptionService");
const FileLogger = require("./fileLogger");

const app = express();
app.use(express.json());
app.use(express.urlencoded({ extended: true }));

app.get("/", (req, res) => {
  res.send("Custom Transcriber Service is running");
});

const server = http.createServer(app);

const config = {
  DEEPGRAM_API_KEY: process.env.DEEPGRAM_API_KEY,
  PORT: process.env.PORT || 3001,
};

const logger = new FileLogger();

const wss = new WebSocketServer({ server, path: "/api/custom-transcriber" });
wss.on("connection", (ws) => {
  logger.logDetailed("INFO", "New WebSocket client connected on /api/custom-transcriber", "Server");
  // Create one transcription service (and one Deepgram session) per call so
  // that concurrent or successive calls do not share state or listeners.
  const transcriptionService = new TranscriptionService(config, logger);

  ws.on("message", (data, isBinary) => {
    if (!isBinary) {
      try {
        const msg = JSON.parse(data.toString());
        if (msg.type === "start") {
          logger.logDetailed("INFO", "Received start message from client", "Server", {
            sampleRate: msg.sampleRate,
            channels: msg.channels,
          });
        }
      } catch (err) {
        logger.error("JSON parse error", err, "Server");
      }
    } else {
      transcriptionService.send(data);
    }
  });

  ws.on("close", () => {
    logger.logDetailed("INFO", "WebSocket client disconnected", "Server");
    if (
      transcriptionService.deepgramLive &&
      transcriptionService.deepgramLive.getReadyState() === 1
    ) {
      transcriptionService.deepgramLive.finish();
    }
  });

  ws.on("error", (error) => {
    logger.error("WebSocket error", error, "Server");
  });

  transcriptionService.on("transcription", (text, channel) => {
    const response = {
      type: "transcriber-response",
      transcription: text,
      channel,
    };
    ws.send(JSON.stringify(response));
    logger.logDetailed("INFO", "Sent transcription to client", "Server", { channel, text });
  });

  transcriptionService.on("transcriptionerror", (err) => {
    ws.send(JSON.stringify({ type: "error", error: "Transcription service error" }));
    logger.error("Transcription service error", err, "Server");
  });
});

server.listen(config.PORT, () => {
  console.log(`Server is running on http://localhost:${config.PORT}`);
});

Testing Your Integration

Code Examples – How to Test

  1. Deploy Your Server:
    Run your server with:

    $ node server.js
  2. Expose Your Server:
    To test externally, use a tool like ngrok (for example, ngrok http 3001) to expose your server via HTTPS/WSS.

  3. Initiate a Call with Vapi:
    Use the following cURL command (update the placeholders with your actual values):

    $ curl -X POST https://api.vapi.ai/call \
    >   -H "Authorization: Bearer YOUR_API_KEY" \
    >   -H "Content-Type: application/json" \
    >   -d '{
    >     "phoneNumberId": "YOUR_PHONE_NUMBER_ID",
    >     "customer": {
    >       "number": "CUSTOMER_PHONE_NUMBER"
    >     },
    >     "assistant": {
    >       "transcriber": {
    >         "provider": "custom-transcriber",
    >         "server": {
    >           "url": "wss://your-server.ngrok.io/api/custom-transcriber"
    >         },
    >         "secret": "your_optional_secret_value"
    >       },
    >       "firstMessage": "Hello! I am using a custom transcriber with Deepgram."
    >     },
    >     "name": "CustomTranscriberTest"
    >   }'
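
Before placing a real call, you can also exercise the endpoint locally with a small WebSocket client. The sketch below mimics Vapi's protocol; it assumes a raw 16-bit, 16 kHz, stereo PCM file named audio.raw (a hypothetical filename) in the working directory:

// testClient.js - minimal local test client for the custom transcriber.
const fs = require("fs");
const WebSocket = require("ws");

const ws = new WebSocket("ws://localhost:3001/api/custom-transcriber");

ws.on("open", () => {
  // Mimic Vapi's initial "start" message.
  ws.send(
    JSON.stringify({
      type: "start",
      encoding: "linear16",
      container: "raw",
      sampleRate: 16000,
      channels: 2,
    })
  );
  // Stream the PCM file to the server in small binary chunks.
  const stream = fs.createReadStream("audio.raw", { highWaterMark: 3200 });
  stream.on("data", (chunk) => ws.send(chunk));
  stream.on("end", () => console.log("Finished streaming audio"));
});

ws.on("message", (data) => {
  // Print transcriber-response messages as they arrive.
  console.log("Received:", data.toString());
});

ws.on("error", (err) => console.error("WebSocket error:", err));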

Expected Behavior

  • Vapi connects via WebSocket to your custom transcriber at /api/custom-transcriber.
  • The "start" message initializes the Deepgram session.
  • PCM audio data is forwarded to Deepgram.
  • Deepgram returns transcript events, which are processed with channel detection and debouncing.
  • The final transcript is sent back as a JSON message:
    {
      "type": "transcriber-response",
      "transcription": "The transcribed text",
      "channel": "customer" // or "assistant"
    }

Notes and Limitations

  • Streaming Support Requirement:
    The custom transcriber must support streaming. Vapi sends continuous audio data over the WebSocket, and your server must handle this stream in real time.

  • Secret Header:
    The custom transcriber configuration accepts an optional field called secret. When set, Vapi sends this value with every request as an HTTP header named x-vapi-secret; custom headers can also be configured via a headers field. A verification sketch follows this list.

  • Buffering:
    The solution buffers PCM audio and only forwards whole stereo frames, i.e. the data sent to Deepgram is always a multiple of 4 bytes (one 16‑bit sample per channel). Any trailing partial frame is kept in the buffer until the next chunk arrives.

  • Channel Detection:
    Transcript events from Deepgram include a channel_index array. The service uses the first element to determine whether the transcript is from the customer (0) or the assistant (1). Ensure Deepgram’s response format remains consistent with this logic.
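
If you configure a secret, you can reject unauthorized connections during the WebSocket upgrade. A minimal sketch, assuming the server.js setup above and an expected value stored in a VAPI_SECRET environment variable (an assumption, not part of the original guide):

// Verify the x-vapi-secret header before accepting the WebSocket upgrade.
// VAPI_SECRET is a hypothetical env var holding the expected secret value.
const wss = new WebSocketServer({
  server,
  path: "/api/custom-transcriber",
  verifyClient: (info, done) => {
    const ok = info.req.headers["x-vapi-secret"] === process.env.VAPI_SECRET;
    done(ok, 401, "Unauthorized");
  },
});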


Conclusion

Using a custom transcriber with Vapi gives you the flexibility to integrate any transcription service into your call flows. This guide walked you through the setup, usage, and testing of a solution that streams real-time audio, processes transcripts with multi‑channel detection, and returns formatted responses back to Vapi. Follow the steps above and use the provided code examples to build your custom transcriber solution.
