Korai Docs
AI Backend

Speech Synthesis

This document provides a detailed explanation of the speech synthesis functions in the AI backend. These functions convert text into speech using two services: AWS Polly and Sarvam AI.

synthesize_speech_polly

The synthesize_speech_polly function synthesizes speech from text using the AWS Polly service. It prefers Polly's neural engine for voices that support it, falls back to the standard engine on failure, and returns the audio transcoded from MP3 to WAV.

import io

import boto3
from pydub import AudioSegment


def synthesize_speech_polly(text: str, target_language: str, voice_id: str) -> bytes:
    """Synthesize speech using AWS Polly."""
    # Credentials are resolved through the standard AWS credential chain
    # (environment variables, shared config, or an IAM role); keys must
    # never be hard-coded in source.
    polly_client = boto3.client('polly', region_name='us-west-2')
    
    # Map language codes to Polly language codes
    lang_map = {
        "es-ES": "es-ES", "fr-FR": "fr-FR", "de-DE": "de-DE", "it-IT": "it-IT",
        "pt-BR": "pt-BR", "ja-JP": "ja-JP", "ko-KR": "ko-KR", "zh-CN": "cmn-CN",
        "ar-SA": "arb", "en": "en-US", "en-GB": "en-GB", "ru-RU": "ru-RU",
        "nl-NL": "nl-NL", "sv-SE": "sv-SE", "da-DK": "da-DK", "no-NO": "nb-NO",
        "fi-FI": "fi-FI", "pl-PL": "pl-PL"
    }
    
    # Voices that require the neural engine
    neural_voices = {
        "Sergio", "Enrique", "Conchita", "Lucia", "Lupe", "Penelope", "Miguel",  # Spanish
        "Lea", "Remy", "Mathieu", "Celine",  # French
        "Daniel", "Vicki", "Hans", "Marlene",  # German
        "Bianca", "Giorgio", "Carla",  # Italian
        "Thiago", "Camila", "Ricardo", "Vitoria",  # Portuguese
        "Matthew", "Joanna", "Kendra", "Kimberly", "Salli", "Joey", "Justin", "Ivy",  # English US
        "Arthur", "Amy", "Brian", "Emma",  # English GB
        "Takumi", "Mizuki", "Seoyeon", "Tomoko",  # Japanese/Korean
        "Zhiyu", "Kangkang",  # Chinese
        "Zeina", "Hala"  # Arabic
    }
    
    polly_lang = lang_map.get(target_language, "en-US")
    
    try:
        # Prefer the neural engine for voices known to support it;
        # fall back to the standard engine if the neural request fails.
        engine = "neural" if voice_id in neural_voices else "standard"
        
        try:
            response = polly_client.synthesize_speech(
                Text=text,
                OutputFormat='mp3',
                VoiceId=voice_id,
                LanguageCode=polly_lang,
                Engine=engine
            )
        except Exception as neural_error:
            if "neural" in str(neural_error).lower() and engine == "neural":
                print(f"Neural engine failed for voice {voice_id}, falling back to standard engine")
                response = polly_client.synthesize_speech(
                    Text=text,
                    OutputFormat='mp3',
                    VoiceId=voice_id,
                    LanguageCode=polly_lang,
                    Engine="standard"
                )
            else:
                raise  # re-raise with the original traceback
        
        # Convert MP3 to WAV using pydub
        audio_data = response['AudioStream'].read()
        audio_segment = AudioSegment.from_mp3(io.BytesIO(audio_data))
        
        # Export as WAV
        wav_buffer = io.BytesIO()
        audio_segment.export(wav_buffer, format="wav")
        return wav_buffer.getvalue()
        
    except Exception as e:
        print(f"AWS Polly TTS failed: {e}")
        raise

Parameters:

  • text: The text to synthesize.
  • target_language: The target language code (e.g. "es-ES"); unmapped codes fall back to "en-US".
  • voice_id: The Polly voice to use (e.g. "Lucia" or "Joanna").

Returns:

  • The synthesized audio data as WAV-encoded bytes (Polly's MP3 output is transcoded via pydub).
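
A minimal usage sketch (the sample text and output path are illustrative; AWS credentials are assumed to be configured in the environment). Note that pydub's MP3 decoding requires ffmpeg to be available on the host:

wav_bytes = synthesize_speech_polly(
    text="Hola, ¿cómo estás?",
    target_language="es-ES",
    voice_id="Lucia",
)

with open("output.wav", "wb") as f:
    f.write(wav_bytes)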

synthesize_speech_sarvam

The synthesize_speech_sarvam function synthesizes speech from text using the Sarvam AI service. Long text is split into chunks, synthesized chunk by chunk, and merged with short silences into a single WAV.

import base64
import io

from pydub import AudioSegment


def synthesize_speech_sarvam(text: str, target_language_code: str, sarvam_client,
                             speaker: str = "abhilash", pitch: float = 0.0,
                             pace: float = 1.0, loudness: float = 1.0,
                             sample_rate: int = 16000) -> bytes:
    """Synthesize speech using Sarvam AI, chunking long text as needed."""
    # chunk_text and TTS_MAX_CHARS are module-level helpers defined elsewhere
    # in the backend (a sketch follows this function).
    chunks = chunk_text(text, TTS_MAX_CHARS)
    print(f"TTS chunks: {len(chunks)}")

    if len(chunks) == 1:
        audio = sarvam_client.text_to_speech.convert(
            text=chunks[0],
            model="bulbul:v2",
            speaker=speaker.lower(),
            pitch=pitch,
            target_language_code=target_language_code,
            pace=pace,
            loudness=loudness,
            speech_sample_rate=sample_rate,
            enable_preprocessing=False
        )
        return base64.b64decode("".join(audio.audios))
    
    # Handle multiple chunks with merging
    audio_segments = []

    for i, chunk in enumerate(chunks):
        try:
            audio = sarvam_client.text_to_speech.convert(
                text=chunk,
                model="bulbul:v2",
                speaker=speaker.lower(),
                pitch=pitch,
                target_language_code=target_language_code,
                pace=pace,
                loudness=loudness,
                speech_sample_rate=sample_rate,
                enable_preprocessing=False
            )

            audio_data = base64.b64decode("".join(audio.audios))

            try:
                segment = AudioSegment.from_wav(io.BytesIO(audio_data))
            except Exception:
                try:
                    segment = AudioSegment.from_mp3(io.BytesIO(audio_data))
                except Exception:
                    segment = AudioSegment.from_raw(
                        io.BytesIO(audio_data),
                        sample_width=2,
                        frame_rate=sample_rate,
                        channels=1
                    )

            audio_segments.append(segment)
            print(f"TTS Chunk {i+1} processed — {len(segment)} ms")

        except Exception as e:
            print(f"TTS Chunk {i+1} failed: {e}")
            audio_segments.append(AudioSegment.silent(duration=1000))

    if not audio_segments:
        raise RuntimeError("Failed to generate audio")

    # Merge all audio segments
    final_audio = audio_segments[0]
    for seg in audio_segments[1:]:
        final_audio += AudioSegment.silent(duration=200)  # 200ms gap between chunks
        final_audio += seg

    # Export as WAV
    out_buffer = io.BytesIO()
    final_audio.export(out_buffer, format="wav")
    return out_buffer.getvalue()
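
The chunk_text helper and the TTS_MAX_CHARS limit are defined elsewhere in the backend and are not shown above. As a rough illustration only, a sentence-aware splitter could look like the sketch below (the 500-character limit is an assumed value, not the backend's actual constant):

import re

TTS_MAX_CHARS = 500  # assumed limit; the backend's real value may differ

def chunk_text(text: str, max_chars: int) -> list[str]:
    """Split text into chunks of at most max_chars, preferring sentence boundaries."""
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    chunks, current = [], ""
    for sentence in sentences:
        # Start a new chunk when appending this sentence would exceed the limit
        if current and len(current) + len(sentence) + 1 > max_chars:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks or [""]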

Parameters:

  • text: The text to synthesize.
  • target_language_code: The target language code (e.g. "hi-IN").
  • sarvam_client: An instance of the Sarvam AI client.
  • speaker: The speaker voice to use (default "abhilash"; lowercased before the API call).
  • pitch: Pitch adjustment for the synthesized speech (default 0.0).
  • pace: Speaking pace of the synthesized speech (default 1.0).
  • loudness: Loudness of the synthesized speech (default 1.0).
  • sample_rate: Sample rate of the synthesized speech in Hz (default 16000).

Returns:

  • The synthesized audio data as WAV-encoded bytes; when the text is chunked, segments are merged with 200 ms silence gaps before export.
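
A minimal usage sketch, assuming the sarvamai SDK's SarvamAI client (the import, client construction, and API key handling here are assumptions; match them to how the backend actually builds its client):

import os
from sarvamai import SarvamAI  # assumed import path

client = SarvamAI(api_subscription_key=os.environ["SARVAM_API_KEY"])

wav_bytes = synthesize_speech_sarvam(
    text="Hello, how are you today?",
    target_language_code="en-IN",
    sarvam_client=client,
)

with open("sarvam_output.wav", "wb") as f:
    f.write(wav_bytes)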