Real Time development
This guide will help you get started transcribing audio in real time using a websocket connection. Full documentation for the Real Time API can be found on our Real Time API documentation page.
In order to interact with any of the Scribe APIs, you will need an access token. To get an access token, visit our authentication guide.
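If you prefer not to hard-code the token in your script, one simple option is to load it from an environment variable before opening the websocket. This is only a minimal sketch, assuming the token is stored in an environment variable named SCRIBE_ACCESS_TOKEN (the variable name is an example chosen here, not part of the API):

import os

# Load the access token from the environment so it does not live in source control.
# "SCRIBE_ACCESS_TOKEN" is a name chosen for this example; use whatever fits your setup.
token = os.environ["SCRIBE_ACCESS_TOKEN"]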
The Real Time API provides a way to send data from a single PCM-encoded audio file and get the transcribed text back over a single websocket connection. Audio must be broken up and sent to Scribe in segments of at most 15 seconds.
Here is example Python code that can be used to send audio data, less than 15 seconds long, to Scribe for transcription:
import asyncio
import json
import websockets
url = "wss://scribe.kensho.com/ws"
token = "<YOUR ACCESS TOKEN HERE>"

def _handle_response(response, expected_message):
    """Ensures that we get the expected response back from Scribe"""
    # If our response contains the message we expect then we're good!
    if response["message"] == expected_message:
        return
    # Otherwise - something bad happened
    if response["message"] == "Error":
        raise Exception("An error occurred '{}'".format(response["reason"]))
    else:
        raise Exception("Unexpected response message '{}'".format(response["message"]))

async def _send_data(websocket, pcm_encoded_audio):
    seq_num = 0
    await websocket.send(json.dumps({
        "message": "AddData",
        "audio": pcm_encoded_audio,
        "sequence_number": seq_num,
    }))
    response = json.loads(await websocket.recv())
    _handle_response(response, "DataAdded")
    # Tell Scribe we are done sending data
    await websocket.send(json.dumps({"message": "EndOfStream", "last_sequence_number": seq_num}))

async def transcribe(pcm_encoded_audio):
    """Transcribe PCM audio into text"""
    async with websockets.connect(url) as websocket:
        # Step 1 - authenticate
        await websocket.send(json.dumps({
            "message": "Authenticate",
            "token": token,
        }))
        response = json.loads(await websocket.recv())
        _handle_response(response, "Authenticated")
        # Step 2 - tell the server we want to start a transcription for audio
        # of a specified format
        await websocket.send(json.dumps({
            "message": "StartTranscription",
            "audio_format": {
                "type": "RAW",
                "encoding": "pcm_s16le",
                "sample_rate_hz": 16000,
                "num_channels": 1,
            },
        }))
        response = json.loads(await websocket.recv())
        _handle_response(response, "TranscriptionStarted")
        print("Transcribing request ID '{}'".format(response["request_id"]))
        # Step 3 - send the audio data to the server
        await _send_data(websocket, pcm_encoded_audio)
        # Step 4 - wrap things up!
        response = json.loads(await websocket.recv())
        while response["message"] != "EndOfTranscript":
            if response["message"] == "AddTranscript":
                print("Transcribed '{}'".format(response["transcript"]["transcript"]))
            response = json.loads(await websocket.recv())
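For reference, the responses the example reads from Scribe carry a "message" field plus a few message-specific fields. The outline below is only a rough sketch inferred from the code above, not the full schema; see the Real Time API documentation for the authoritative message formats:

# Response shapes as used by the example code (inferred, not exhaustive):
#   {"message": "Authenticated"}
#   {"message": "TranscriptionStarted", "request_id": "..."}
#   {"message": "DataAdded", ...}
#   {"message": "AddTranscript", "transcript": {"transcript": "recognized text", ...}}
#   {"message": "EndOfTranscript"}
#   {"message": "Error", "reason": "..."}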
To transcribe audio in real time that is already in PCM format, you can read it in from the file and send it directly to Scribe.
Here is an example in Python using the previous transcribe function:
import asyncio
import base64

with open("/path/to/audio.wav", "rb") as f:
    pcm_encoded_audio = base64.b64encode(f.read()).decode("ascii")

asyncio.get_event_loop().run_until_complete(transcribe(pcm_encoded_audio))
To transcribe audio in real time from a format that is not PCM, you can use a third-party library to convert the audio first.
Here is an example in Python using the pydub library along with the previous transcribe function:
import asyncio
import base64
import io
from pydub import AudioSegment

# Open the MP3 file and convert it to signed 16-bit little endian PCM with a sample rate
# of 16000 Hz and a single channel. The format, sample rate, and channel count of the
# conversion need to match the transcription properties we pass to Scribe. Do the
# conversion (export) in memory so that we can turn around and encode it as base64 to
# send to Scribe.
buf = io.BytesIO()
audio = AudioSegment.from_mp3("/path/to/audio.mp3")
audio.set_frame_rate(16000).set_channels(1).export(buf, format="wav", codec="pcm_s16le")
pcm_encoded_audio = base64.b64encode(buf.getvalue()).decode("ascii")

asyncio.get_event_loop().run_until_complete(transcribe(pcm_encoded_audio))
To transcribe audio in real time that is longer than 15 seconds, you can use a third-party library to break up the audio and send it in pieces.
Here is a simple example in Python using the pydub library with the same transcribe function as before, but with a modified _send_data that sends data in 1-second chunks:
import asyncio
import base64
import io
import json
import websockets
from pydub import AudioSegment
# The size of an audio chunk in milliseconds
AUDIO_SIZE = 1 * 1000

async def _send_data(websocket, audio):
    seq_num = 0
    audio = audio.set_frame_rate(16000).set_channels(1)
    while audio.duration_seconds > 0:
        # Send a new chunk of audio for transcription. Use a fresh buffer for each
        # chunk so that earlier chunks are not included in later messages.
        buf = io.BytesIO()
        audio[:AUDIO_SIZE].export(buf, format="wav", codec="pcm_s16le")
        await websocket.send(json.dumps({
            "message": "AddData",
            "audio": base64.b64encode(buf.getvalue()).decode("ascii"),
            "sequence_number": seq_num,
        }))
        seq_num += 1
        # Move forward in the audio stream
        audio = audio[AUDIO_SIZE:]
    # Tell Scribe we are done sending data
    await websocket.send(json.dumps({"message": "EndOfStream", "last_sequence_number": seq_num - 1}))

async def transcribe(audio):
    # Same as in the example above
    ...

audio = AudioSegment.from_mp3("/path/to/audio.mp3")
asyncio.get_event_loop().run_until_complete(transcribe(audio))
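The loop above sends chunks as quickly as they can be exported. If you want the sends to track real time instead (for example, to exercise the API the way a live feed would), one option is to sleep for the duration of each chunk before sending the next one. Below is a minimal sketch of that variation, reusing the AUDIO_SIZE constant and the same message format as above; the function name _send_data_paced is just for illustration:

import asyncio
import base64
import io
import json

async def _send_data_paced(websocket, audio):
    """Send audio in 1-second chunks, pacing the sends at roughly real-time speed."""
    seq_num = 0
    audio = audio.set_frame_rate(16000).set_channels(1)
    while audio.duration_seconds > 0:
        buf = io.BytesIO()
        audio[:AUDIO_SIZE].export(buf, format="wav", codec="pcm_s16le")
        await websocket.send(json.dumps({
            "message": "AddData",
            "audio": base64.b64encode(buf.getvalue()).decode("ascii"),
            "sequence_number": seq_num,
        }))
        seq_num += 1
        audio = audio[AUDIO_SIZE:]
        # Wait for the length of the chunk (AUDIO_SIZE is in milliseconds) so that
        # audio is pushed to Scribe at approximately the rate it would be spoken
        await asyncio.sleep(AUDIO_SIZE / 1000)
    # Tell Scribe we are done sending data
    await websocket.send(json.dumps({"message": "EndOfStream", "last_sequence_number": seq_num - 1}))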