Real-time Synthesis
VORA's real-time synthesis generates voice with sub-100ms latency, making it a fit for conversational AI, live applications, and other interactive experiences that need immediate voice responses. Audio is generated and streamed as it is synthesized, so playback can begin before the full utterance is complete.
Real-time synthesis with VORA provides:
Sub-100ms latency for immediate voice responses
Streaming audio generation for continuous playback
Dynamic emotion control during synthesis
Interrupt handling for natural conversation flow
Edge deployment for offline real-time processing
import sagea
import asyncio

# Initialize client for real-time synthesis
client = sagea.VoraClient(api_key="your-api-key")

# Basic real-time synthesis
async def real_time_example():
    # Stream voice synthesis and play each chunk as it arrives
    async for chunk in client.stream(
        text="Hello! This is real-time voice synthesis.",
        model="vora-l1",  # Optimized for low latency
        emotion="friendly"
    ):
        # Play audio chunk immediately
        await play_audio_chunk(chunk)

asyncio.run(real_time_example())
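The examples on this page call a play_audio_chunk helper that the snippets do not define; one possible sketch, assuming each chunk exposes raw 16-bit mono PCM at 24 kHz via to_bytes() (check the actual format your stream returns), uses the sounddevice library:

import asyncio
import sounddevice as sd

# Shared raw-PCM output stream; the sample rate and format here are assumptions
_output = sd.RawOutputStream(samplerate=24000, channels=1, dtype="int16")
_output.start()

async def play_audio_chunk(chunk):
    # Device writes block, so hand them off to a worker thread
    await asyncio.get_running_loop().run_in_executor(None, _output.write, chunk.to_bytes())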
Generate and play audio as it's synthesized:
# Stream long-form content
async def stream_content():
    text = """
    Welcome to our real-time voice synthesis demonstration.
    This content is being generated and streamed as you hear it,
    with minimal latency for a natural listening experience.
    """

    async for chunk in client.stream(
        text=text,
        model="vora-l1",
        chunk_size=1024,  # Audio chunk size
        emotion="conversational"
    ):
        # Queue the chunk and start playback once enough audio is buffered;
        # audio_buffer / play_if_ready are placeholders (see the AudioBuffer class below)
        audio_buffer.append(chunk)
        await play_if_ready()
For web applications requiring real-time interaction:
import asyncio
import json
import websockets

async def websocket_handler(websocket):
    async for message in websocket:
        data = json.loads(message)

        # Stream synthesis results back to the client as binary frames
        async for chunk in client.stream(
            text=data["text"],
            model="vora-e0",  # Ultra-fast model
            emotion=data.get("emotion", "neutral")
        ):
            await websocket.send(chunk.to_bytes())

# Start the WebSocket server
async def main():
    async with websockets.serve(websocket_handler, "localhost", 8765):
        await asyncio.Future()  # run forever

asyncio.run(main())
// Client-side real-time synthesis
class RealTimeVORA {
  constructor(apiKey) {
    this.apiKey = apiKey; // authentication handling depends on your server setup
    this.ws = new WebSocket('wss://api.sagea.space/v1/vora/stream');
    this.ws.binaryType = 'arraybuffer'; // decodeAudioData expects an ArrayBuffer
    this.audioContext = new AudioContext();
    this.setupWebSocket();
  }

  setupWebSocket() {
    this.ws.onmessage = (event) => {
      // Play audio chunk immediately
      this.playAudioChunk(event.data);
    };
  }

  synthesize(text, options = {}) {
    this.ws.send(JSON.stringify({
      text: text,
      model: 'vora-l1',
      ...options
    }));
  }

  async playAudioChunk(audioData) {
    const audioBuffer = await this.audioContext.decodeAudioData(audioData);
    const source = this.audioContext.createBufferSource();
    source.buffer = audioBuffer;
    source.connect(this.audioContext.destination);
    source.start();
  }
}

// Usage
const vora = new RealTimeVORA('your-api-key');
vora.synthesize('Hello! This is real-time synthesis.', {
  emotion: 'excited'
});
Build responsive voice assistants:
class VoiceAssistant:
    def __init__(self):
        self.vora = sagea.VoraClient(api_key="your-api-key")
        self.sage = sagea.ChatClient(api_key="your-api-key")
        self.conversation_state = []

    async def handle_voice_input(self, audio_input):
        # Convert speech to text (external service)
        user_text = await speech_to_text(audio_input)

        # Get AI response
        response = await self.sage.chat(
            messages=self.conversation_state + [
                {"role": "user", "content": user_text}
            ],
            model="sage-mini"  # Fast response
        )

        # Stream voice response immediately
        async for chunk in self.vora.stream(
            text=response.content,
            model="vora-e0",
            emotion="helpful"
        ):
            await self.play_audio(chunk)

        # Update conversation state
        self.conversation_state.extend([
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": response.content}
        ])

# Usage (inside an async context)
assistant = VoiceAssistant()
await assistant.handle_voice_input(user_audio)
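To keep a conversation going, wrap the handler in a capture-and-respond loop; a minimal sketch, where record_until_silence is a hypothetical microphone-capture helper you would supply:

import asyncio

async def conversation_loop():
    assistant = VoiceAssistant()
    while True:
        # record_until_silence() is hypothetical: capture microphone audio
        # until the user stops speaking, then hand it to the assistant
        user_audio = await record_until_silence()
        await assistant.handle_voice_input(user_audio)

asyncio.run(conversation_loop())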
Handle conversation interruptions naturally:
class InterruptibleSynthesis:
    def __init__(self):
        self.current_stream = None
        self.is_speaking = False

    async def speak(self, text, emotion="neutral"):
        # Stop current speech if interrupted
        if self.current_stream:
            await self.current_stream.stop()

        self.is_speaking = True
        self.current_stream = client.stream(
            text=text,
            model="vora-e0",
            emotion=emotion
        )

        try:
            async for chunk in self.current_stream:
                if not self.is_speaking:  # Check for interruption
                    break
                await self.play_audio(chunk)
        finally:
            self.is_speaking = False

    async def interrupt(self):
        """Stop current speech for user input"""
        self.is_speaking = False
        if self.current_stream:
            await self.current_stream.stop()
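A quick usage sketch: start speaking, then interrupt after a second to simulate the user cutting in (the timing is arbitrary):

import asyncio

async def barge_in_demo():
    synth = InterruptibleSynthesis()
    speak_task = asyncio.create_task(
        synth.speak("Here is a long explanation the user may not wait for...", emotion="calm")
    )
    await asyncio.sleep(1.0)  # the user starts talking after one second
    await synth.interrupt()   # stop playback so the assistant can listen
    await speak_task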
Change emotions during synthesis:
async def dynamic_emotion_demo():
    # Create emotion sequence (start times in seconds)
    emotion_timeline = [
        {"start": 0.0, "emotion": "neutral"},
        {"start": 2.0, "emotion": "excited"},
        {"start": 4.0, "emotion": "calm"},
        {"start": 6.0, "emotion": "professional"}
    ]

    text = "This demonstration shows how emotions can change dynamically during speech synthesis."

    # Stream with emotion changes
    stream = client.create_emotion_stream(
        text=text,
        model="vora-v1",
        emotion_timeline=emotion_timeline
    )

    async for chunk in stream:
        # Each chunk includes current emotion metadata
        print(f"Playing: {chunk.emotion} at {chunk.timestamp}s")
        await play_audio_chunk(chunk)
Adapt emotions based on conversation context:
class EmotionalSynthesis:
    def __init__(self):
        self.emotion_history = []
        self.user_sentiment = "neutral"

    def analyze_context(self, user_input, ai_response):
        """Determine the appropriate emotion based on context"""
        if "thank you" in user_input.lower():
            return "warm"
        elif "problem" in user_input.lower() or "error" in user_input.lower():
            return "helpful"
        elif "excited" in user_input.lower():
            return "enthusiastic"
        else:
            return "professional"

    async def contextual_synthesis(self, text, user_input):
        emotion = self.analyze_context(user_input, text)

        async for chunk in client.stream(
            text=text,
            model="vora-v1",
            emotion=emotion,
            emotion_intensity=0.7
        ):
            await play_audio_chunk(chunk)

        self.emotion_history.append(emotion)
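For example, a thank-you from the user maps to the warm emotion:

# Inside an async context
es = EmotionalSynthesis()
await es.contextual_synthesis(
    text="You're very welcome! Happy to help anytime.",
    user_input="Thank you so much for fixing this!"
)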
Choose the right model for your latency requirements:
# Ultra-low latency (10-20ms)
ultra_fast = client.stream(
    text="Instant response",
    model="vora-e0",
    quality="standard"
)

# Balanced latency/quality (40-60ms)
balanced = client.stream(
    text="Good quality with fast response",
    model="vora-l1",
    quality="high"
)

# High quality (80-120ms)
premium = client.stream(
    text="Studio quality with real-time capability",
    model="vora-v1",
    quality="premium"
)
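To check these figures in your own environment, time how long the first audio chunk takes to arrive (time-to-first-audio), since that is what determines perceived responsiveness; a minimal sketch using the same stream call:

import time

async def measure_ttfa(text, model):
    """Return milliseconds until the first audio chunk arrives."""
    start = time.perf_counter()
    async for _chunk in client.stream(text=text, model=model):
        return (time.perf_counter() - start) * 1000
    return None

# e.g. print(await measure_ttfa("Instant response", "vora-e0"))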
Optimize for repeated content:
# Cache frequently used phrases
cache = {}

async def cached_synthesis(text, emotion="neutral"):
    cache_key = f"{text}:{emotion}"

    if cache_key in cache:
        # Play cached audio immediately
        await play_audio(cache[cache_key])
    else:
        # Generate, play, and cache
        audio_chunks = []
        async for chunk in client.stream(
            text=text,
            model="vora-l1",
            emotion=emotion
        ):
            audio_chunks.append(chunk)
            await play_audio_chunk(chunk)
        # Store raw bytes so the phrase can be replayed without another API call
        cache[cache_key] = b"".join(c.to_bytes() for c in audio_chunks)

# Preload common responses (run inside an async context)
common_phrases = [
    "Hello, how can I help you?",
    "I understand your concern.",
    "Let me help you with that.",
    "Thank you for your patience."
]

for phrase in common_phrases:
    await cached_synthesis(phrase, "helpful")
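The cached branch above calls a play_audio helper that takes a complete byte blob rather than a stream of chunks; a possible sketch, again assuming raw 16-bit mono PCM at 24 kHz:

import asyncio
import sounddevice as sd

_playback = sd.RawOutputStream(samplerate=24000, channels=1, dtype="int16")
_playback.start()

async def play_audio(audio_bytes):
    # Hand the blocking device write off to a worker thread
    await asyncio.get_running_loop().run_in_executor(None, _playback.write, audio_bytes)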
Manage audio buffers for smooth playback:
class AudioBuffer:
    def __init__(self, max_size=10):
        self.buffer = []
        self.max_size = max_size
        self.playing = False

    async def add_chunk(self, chunk):
        self.buffer.append(chunk)
        if not self.playing:
            # Start playback in the background so the producer isn't blocked
            asyncio.create_task(self.start_playback())

    async def start_playback(self):
        self.playing = True
        while self.buffer:
            chunk = self.buffer.pop(0)
            await play_audio_chunk(chunk)
            await asyncio.sleep(0.01)  # Small delay for smooth playback
        self.playing = False

# Usage with streaming (inside an async context)
buffer = AudioBuffer()
async for chunk in client.stream(text="Buffered playback", model="vora-l1"):
    await buffer.add_chunk(chunk)
Deploy VORA models locally for minimal latency:
# Initialize edge client
edge_client = sagea.VoraEdgeClient(
    model_path="./vora-l1-edge",
    device="cuda"  # or "cpu"
)

# Real-time synthesis without internet
async def offline_synthesis(text):
    async for chunk in edge_client.stream(
        text=text,
        emotion="neutral"
    ):
        await play_audio_chunk(chunk)  # <10ms latency
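If you deploy to machines that may or may not have a GPU, you can pick the device at startup; a sketch assuming a PyTorch-backed runtime (the edge client's actual backend may differ):

import torch
import sagea

# Fall back to CPU when no CUDA device is available (assumes a PyTorch-backed runtime)
device = "cuda" if torch.cuda.is_available() else "cpu"
edge_client = sagea.VoraEdgeClient(model_path="./vora-l1-edge", device=device)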
Integrate with mobile applications:
// iOS Swift example
import VORASDK

class RealTimeVoice {
    private let vora = VORAClient()

    func streamSynthesis(text: String, emotion: String = "neutral") {
        vora.stream(
            text: text,
            model: .voraL1,
            emotion: emotion
        ) { audioChunk in
            // Play audio chunk immediately
            self.playAudio(audioChunk)
        }
    }
}
Handle conversations with multiple speakers:
async def multi_speaker_conversation():
    speakers = {
        "assistant": {"voice": "default", "emotion": "helpful"},
        "narrator": {"voice": "custom_narrator", "emotion": "professional"}
    }

    conversation = [
        {"speaker": "assistant", "text": "Hello! How can I help you today?"},
        {"speaker": "narrator", "text": "The user is asking about our services."},
        {"speaker": "assistant", "text": "I'd be happy to explain our offerings."}
    ]

    for turn in conversation:
        speaker_config = speakers[turn["speaker"]]

        async for chunk in client.stream(
            text=turn["text"],
            voice=speaker_config["voice"],
            emotion=speaker_config["emotion"],
            model="vora-v1"
        ):
            await play_audio_chunk(chunk)
Automatically adjust quality based on network conditions:
import time

class AdaptiveQuality:
    def __init__(self):
        self.current_quality = "high"
        self.latency_history = []

    def adjust_quality(self, latency):
        self.latency_history.append(latency)
        recent = self.latency_history[-10:]
        avg_latency = sum(recent) / len(recent)

        if avg_latency > 200:
            self.current_quality = "standard"
        elif avg_latency > 100:
            self.current_quality = "high"
        else:
            self.current_quality = "premium"

    async def adaptive_synthesis(self, text):
        start_time = time.time()

        async for chunk in client.stream(
            text=text,
            model="vora-l1",
            quality=self.current_quality
        ):
            await play_audio_chunk(chunk)

        latency = (time.time() - start_time) * 1000
        self.adjust_quality(latency)
High Latency:
Use VORA-E0 or VORA-L1 models
Enable edge deployment
Reduce audio quality if needed
Implement proper buffering
Audio Dropouts:
Increase buffer size
Check network stability
Use WebSocket connections
Implement retry logic (see the sketch after this list)
Quality Degradation:
Monitor network conditions
Use adaptive quality settings
Cache common phrases
Optimize chunk sizes
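Retry logic for dropped streams can be as simple as re-issuing the request with exponential backoff; a hedged sketch (the SDK may already retry internally, and the broad exception handler is a placeholder for its real network error type):

import asyncio

async def stream_with_retry(text, model="vora-l1", max_attempts=3):
    for attempt in range(1, max_attempts + 1):
        try:
            async for chunk in client.stream(text=text, model=model):
                await play_audio_chunk(chunk)
            return
        except Exception:  # swap in the SDK's network error type if it exposes one
            if attempt == max_attempts:
                raise
            # Note: a retry restarts the phrase from the beginning
            await asyncio.sleep(0.1 * 2 ** attempt)  # 0.2s, then 0.4s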
Track synthesis latency over time so regressions show up early:

import time
import numpy as np

class PerformanceMonitor:
    def __init__(self):
        self.metrics = {
            "latency": [],
            "quality_score": [],
            "network_errors": 0
        }

    def log_synthesis(self, start_time, end_time, quality):
        latency = (end_time - start_time) * 1000
        self.metrics["latency"].append(latency)
        self.metrics["quality_score"].append(quality)

        print(f"Synthesis latency: {latency:.2f}ms")
        print(f"Average latency: {np.mean(self.metrics['latency']):.2f}ms")
Typical real-time use cases include:
Real-time commentary and narration for live events
Dynamic character voices that respond instantly to player actions
Real-time text-to-speech for live conversations and presentations
Instant voice responses in call centers and chat applications
Interactive tutoring with immediate voice feedback
Next steps: Emotion Control (add emotional expression to real-time synthesis) and the API Reference (complete streaming API documentation).
Real-time synthesis with VORA enables truly conversational AI experiences with natural, immediate voice responses that feel alive and engaging.