Source code for pylips.speech.polly_tts

import boto3
import json
import os
from contextlib import closing
import pickle

[docs]class PollyTTS:
    '''
    A text-to-speech backend that uses Amazon Polly.

    This class is used to generate audio files from text using Amazon Polly. It can also generate visemes
    that correspond to the audio files that it generates.

    args:
        None
    '''

[docs]    def __init__(self):
        self.tts = boto3.client('polly')

        # all current amazon polly voices
        self.voices = ['Zeina','Hala','Zayd','Lisa','Arlet','Hiujin','Zhiyu','Naja','Mads',
                       'Sofie','Laura','Lotte','Ruben','Nicole','Olivia','Russell','Amy','Emma',
                       'Brian','Arthur','Aditi','Raveena','Kajal','Niamh','Aria','Ayanda','Danielle',
                       'Gregory','Ivy','Joanna','Kendra','Kimberly','Salli','Joey','Justin','Kevin',
                       'Matthew','Ruth','Stephen','Geraint','Suvi','Celine','Léa','Mathieu','Rémi',
                       'Isabelle','Chantal','Gabrielle','Liam','Marlene','Vicki','Hans','Daniel',
                       'Hannah','Aditi','Kajal','Dora','Karl','Carla','Bianca','Giorgio','Adriano',
                       'Mizuki','Takumi','Kazuha','Tomoko','Seoyeon','Liv','Ida','Ewa','Maja','Jacek',
                       'Jan','Ola','Camila','Vitoria','Ricardo','Thiago','Ines','Cristiano','Carmen',
                       'Tatyana','Maxim','Conchita','Lucia','Enrique','Sergio','Mia','Andrés','Lupe',
                       'Penelope','Miguel','Pedro','Astrid','Elin','Filiz','Burcu','Gwyneth']

[docs]    def list_voices(self):
        '''
        Lists all the voices that are available in the Amazon Polly TTS backend.

        For a more in-depth look at the voices, see the `Amazon Polly documentation 
        <https://docs.aws.amazon.com/polly/latest/dg/voicelist.html>`_.

        args:
            None
        '''
        for i, voice in enumerate(self.voices):
            print(f'{i}: {voice}')

[docs]    def gen_audio_and_visemes(self, text, voice_id=None, fname=None):
        '''
        Generates an audio file and visemes from a string of text using Amazon Polly.

        args:
            text (str): the text that the robot should speak
            voice_id (str): the voice that the robot should speak in
            fname (str): the name of the file that the audio should be saved to

        returns:
            (tuple): a tuple containing ``fname``, ``times``, and ``visemes``. fname is 
            the path to the audio file, times is a list of times that correspond to the initiation
            of the visemes, and visemes is a list of visemes that correspond to the words in the audio 
        '''
        if voice_id is None:
            voice_id = 'Justin'
            
        if fname is None:   
            fname = f"pylips_phrases/{voice_id}_output.mp3"
        else:
            #if it was already cached, return it, otherwise, generate it and return it
            fname = f"pylips_phrases/{fname}.mp3"
            if os.path.exists(fname):
                times, visemes = pickle.load(open(f'{fname[:-4]}.pkl', 'rb'))
                return fname, times, visemes

        # Synthesize speech
        response = self.tts.synthesize_speech(
            TextType='ssml',
            Text=f"<speak>{text}</speak>",
            OutputFormat='mp3',
            VoiceId=voice_id
        )

        audio_stream = response["AudioStream"]

        if not os.path.isdir('pylips_phrases'):
            os.mkdir('pylips_phrases')
             
        with open(fname, "wb") as file:
            file.write(audio_stream.read())

        # Synthesize visemes
        # Synthesize speech with viseme output
        response = self.tts.synthesize_speech(
            TextType='ssml',
            Text=f"<speak>{text}</speak>",
            OutputFormat='json',
            SpeechMarkTypes=['viseme'],
            VoiceId=voice_id
        )
        
         # Process the response
        if 'AudioStream' in response:
            # Do something with the audio stream if needed
            with closing(response["AudioStream"]) as stream:
                        data = stream.read().decode('utf-8')
                        xSheet = data.split('\n')
                        xSheet = [json.loads(line) for line in xSheet if line != '']

        times = [x['time'] / 1000. for x in xSheet]
        visemes = [VIS2IPA[x['value']] for x in xSheet]

        pickle.dump((times, visemes), open(f'{fname[:-4]}.pkl', 'wb'))

        return fname, times, visemes
    
[docs]    def get_audio_and_visemes(self, fname):
        '''
        Loads presaved audio and visemes from a file.

        args:
            fname (str): the name of the file that the audio and visemes were saved to
        
        returns:
            (tuple): a tuple containing ``fname``, ``times``, and ``visemes``. fname is 
            the path to the audio file, times is a list of times that correspond to the initiation
            of the visemes, and visemes is a list of visemes that correspond to the words in the audio 
 
        
        raises:
            Exception: if the file does not exist
        '''
        #if it was already cached, return it, otherwise, raise an error
        fname = f"pylips_phrases/{fname}.mp3"

        if os.path.exists(fname):
            times, visemes = pickle.load(open(f'{fname[:-4]}.pkl', 'rb'))
            return fname, times, visemes
        else:
            raise Exception(f'phrase {fname} does not exist')
        
        
VIS2IPA = {"p": "BILABIAL",
            "B": "BILABIAL",
            "f": "LABIODENTAL",
            "T": "INTERDENTAL",
            "s": "DENTAL_ALVEOLAR",
            "t": "DENTAL_ALVEOLAR",
            "S": "POSTALVEOLAR",
            "r": "POSTALVEOLAR",
            "J": "VELAR_GLOTTAL",
            "k": "VELAR_GLOTTAL",

            "i": "CLOSE_FRONT_VOWEL",
            "u": "CLOSE_BACK_VOWEL",
            "@": "MID_CENTRAL_VOWEL",
            "a": "OPEN_FRONT_VOWEL",
            "e": "OPEN_FRONT_VOWEL",
            "E": "OPEN_FRONT_VOWEL",
            "o": "OPEN_BACK_VOWEL",
            "O": "OPEN_BACK_VOWEL",

            "sil": "IDLE"}