Source code for pylips.speech.system_tts

import pyttsx3
from allosaurus.app import read_recognizer
import soundfile as sf
import pickle
import os

import warnings

# Silence the FutureWarning whose message starts with
# "You are using `torch.load` with `weights_only=False`" so it does not
# clutter the console output.
# NOTE(review): presumably emitted while the allosaurus recognizer loads its
# model through torch -- confirm against the installed torch/allosaurus.
warnings.filterwarnings(
    action="ignore",
    message=r"You are using `torch\.load` with `weights_only=False`.*",
    category=FutureWarning,
)
from sys import platform

class SystemTTS:
    '''
    A text-to-speech backend that uses the system's default TTS engine.

    Audio is synthesized with ``pyttsx3`` (or the ``espeak-ng`` command line
    tool on Linux) and the resulting file is run through the allosaurus
    recognizer to obtain timestamped phones, which are mapped to visemes with
    ``IPA2VISEME``. Audio files and their viseme timings are cached in the
    ``pylips_phrases`` directory, relative to the current working directory.

    args:
        None
    '''

    # Directory that holds the generated ``.wav`` files and ``.pkl`` timings.
    # Paths are built with '/' so the returned ``fname`` is the same string on
    # every platform.
    _PHRASE_DIR = 'pylips_phrases'

    # True when running on Linux, where espeak-ng is invoked directly.
    _ON_LINUX = platform in ('linux', 'linux2')

    def __init__(self):
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 120)
        self.model = read_recognizer()
        self.voices = [voice.id for voice in self.engine.getProperty('voices')]

        if self._ON_LINUX:
            # linux requires special considerations for espeak-ng: the voice
            # names are taken from the fixed-width `espeak-ng --voices` table
            # (columns 4-20 of every row between the header and the trailing
            # empty line).
            with os.popen('espeak-ng --voices') as pipe:
                listing = pipe.read()
            self.voices = [line[4:20].strip() for line in listing.split('\n')[1:-1]]

    def list_voices(self):
        '''
        Lists all the voices that are available in the system's default TTS backend.

        Each voice is printed as ``<index>: <voice>``; either the index or the
        voice string can be passed as ``voice_id`` to ``gen_audio_and_visemes``.

        args:
            None
        '''
        for i, voice in enumerate(self.voices):
            print(f'{i}: {voice}')

    def _resolve_voice(self, voice_id):
        '''
        Turns a user supplied ``voice_id`` into an entry of ``self.voices``.

        args:
            voice_id (str | int | None): a voice name, an index into the list
                of voices, or None to request the default voice

        returns:
            (str | None): the voice name, or None if ``voice_id`` is None

        raises:
            ValueError: if the voice_id is not in the list of available voices
        '''
        if voice_id is None:
            return None

        # bool is a subclass of int but is never a meaningful voice index
        is_index = isinstance(voice_id, int) and not isinstance(voice_id, bool)
        if is_index and -len(self.voices) <= voice_id < len(self.voices):
            return self.voices[voice_id]

        if voice_id not in self.voices:
            raise ValueError(f'voice "{voice_id}" does not exist')
        return voice_id

    @staticmethod
    def _load_cached(wav_path):
        '''
        Loads the cached timings that belong to ``wav_path``.

        args:
            wav_path (str): path to the audio file, ending in ``.wav``

        returns:
            (tuple | None): ``(times, visemes)`` if both the audio file and its
            ``.pkl`` companion exist, otherwise None
        '''
        pkl_path = f'{wav_path[:-4]}.pkl'
        if not (os.path.exists(wav_path) and os.path.exists(pkl_path)):
            return None

        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at cache files that were written by this class.
        with open(pkl_path, 'rb') as cache_file:
            times, visemes = pickle.load(cache_file)
        return times, visemes

    def gen_audio_and_visemes(self, text, voice_id=None, fname=None):
        '''
        Generates audio and visemes from a string of text using the system's default TTS engine.

        args:
            text (str): the text that the robot should speak
            voice_id (str | int): the voice that the robot should speak in, given
                either as a voice name or as an index into the list of voices.
                If None, the engine's current voice is left untouched.
            fname (str): the name of the file that the audio should be saved to,
                without extension. If a phrase with this name was already
                generated, the cached result is returned. If None, the audio is
                written to ``<voice>_output.wav`` and always regenerated.

        returns:
            (tuple): a tuple containing ``fname``, ``times``, and ``visemes``. fname is
            the path to the audio file, times is a list of times that correspond to the initiation
            of the visemes, and visemes is a list of visemes that correspond to the phones
            recognized in the audio. The recognizer's times are kept as the strings it
            emitted; the final entry is a float marking the closing ``IDLE`` viseme.

        raises:
            ValueError: if the voice_id is not in the list of available voices
        '''
        voice = self._resolve_voice(voice_id)
        if voice is None:
            voice_id = 'en' if self._ON_LINUX else 'default'
        else:
            # applies to voices given by name and by index alike
            voice_id = voice
            self.engine.setProperty('voice', voice_id)

        if fname is None:
            fname = f'{self._PHRASE_DIR}/{voice_id}_output.wav'
        else:
            # if it was already cached, return it, otherwise, generate it and return it
            fname = f'{self._PHRASE_DIR}/{fname}.wav'
            cached = self._load_cached(fname)
            if cached is not None:
                times, visemes = cached
                return fname, times, visemes

        # make sure the cache directory exists before anything is written to it
        os.makedirs(os.path.dirname(fname), exist_ok=True)

        # Synthesize speech
        if self._ON_LINUX:
            # linux has issues with saving multiple files, so espeak-ng is
            # called directly. Arguments are passed as a list so that quotes
            # or shell metacharacters in `text` cannot break the command.
            import subprocess
            subprocess.run(
                ['espeak-ng', '-v', voice_id, '-s', '100', text, '-w', fname],
                check=True,
            )
        else:
            self.engine.save_to_file(text, fname)
            self.engine.runAndWait()

        # The file is read and written back through soundfile.
        # NOTE(review): presumably this normalizes the file so the recognizer
        # can read it -- confirm.
        data, samplerate = sf.read(fname)
        sf.write(fname, data, samplerate)

        # synthesize visemes: the first space separated field of every output
        # line is used as the time and the last field as the phone
        out = self.model.recognize(fname, timestamp=True, lang_id='eng')
        rows = [row.split(' ') for row in out.split('\n')]
        times = [row[0] for row in rows]
        visemes = [IPA2VISEME[row[-1]] for row in rows]

        # close the mouth shortly after the audio has finished
        times.append(len(data) / samplerate + 0.2)
        visemes.append('IDLE')

        with open(f'{fname[:-4]}.pkl', 'wb') as cache_file:
            pickle.dump((times, visemes), cache_file)

        return fname, times, visemes

    def get_audio_and_visemes(self, fname):
        '''
        Gets the audio and visemes from a file that was previously generated.

        args:
            fname (str): the name of the file that the audio and visemes were saved to. It is not
                necessary to include the file extension.

        returns:
            (tuple): a tuple containing ``fname``, ``times``, and ``visemes``. fname is
            the path to the audio file, times is a list of times that correspond to the initiation
            of the visemes, and visemes is a list of visemes that correspond to the phones
            recognized in the audio

        raises:
            FileNotFoundError: if the audio file or its cached timings do not exist
        '''
        # if it was already cached, return it, otherwise, raise an error
        fname = f'{self._PHRASE_DIR}/{fname}.wav'

        cached = self._load_cached(fname)
        if cached is None:
            raise FileNotFoundError(f'phrase {fname} does not exist')

        times, visemes = cached
        return fname, times, visemes
        

# Maps the phone strings looked up in SystemTTS.gen_audio_and_visemes (the last
# field of each recognizer output line) to the name of the viseme to display.
# A phone that is missing from this table raises a KeyError at lookup time.
IPA2VISEME = {
    # silence; the empty string is what an empty recognizer output line yields
    'sil': 'IDLE',
    '': 'IDLE',

    # BILABIAL
    'k͡p̚': 'BILABIAL',
    'm': 'BILABIAL',
    'b': 'BILABIAL',
    'p': 'BILABIAL',
    'pʰ': 'BILABIAL',

    # LABIODENTAL
    'v': 'LABIODENTAL',
    'f': 'LABIODENTAL',

    # INTERDENTAL
    'θ': 'INTERDENTAL',
    'ð': 'INTERDENTAL',

    # DENTAL_ALVEOLAR
    'l': 'DENTAL_ALVEOLAR',
    'd': 'DENTAL_ALVEOLAR',
    't': 'DENTAL_ALVEOLAR',
    'tʰ': 'DENTAL_ALVEOLAR',
    't̠': 'DENTAL_ALVEOLAR',
    'n': 'DENTAL_ALVEOLAR',
    'ɳ': 'DENTAL_ALVEOLAR',
    's': 'DENTAL_ALVEOLAR',
    'z': 'DENTAL_ALVEOLAR',

    # POSTALVEOLAR
    'ʃ': 'POSTALVEOLAR',
    'ʒ': 'POSTALVEOLAR',
    'ɹ̩': 'POSTALVEOLAR',
    'ɹ': 'POSTALVEOLAR',
    'r': 'POSTALVEOLAR',
    'ɻ': 'POSTALVEOLAR',
    'ɾ': 'POSTALVEOLAR',
    'dʒ': 'POSTALVEOLAR',
    'tʃ': 'POSTALVEOLAR',
    't͡ʃʲ': 'POSTALVEOLAR',
    'ij': 'POSTALVEOLAR',
    'tɕʰ': 'POSTALVEOLAR',
    'x': 'POSTALVEOLAR',
    'd̠': 'POSTALVEOLAR',

    # VELAR_GLOTTAL ('g' is the ASCII letter, 'ɡ' is the IPA script g)
    'h': 'VELAR_GLOTTAL',
    'k': 'VELAR_GLOTTAL',
    'kʰ': 'VELAR_GLOTTAL',
    'g': 'VELAR_GLOTTAL',
    'ɡ': 'VELAR_GLOTTAL',
    'ŋ': 'VELAR_GLOTTAL',
    'ʔ': 'VELAR_GLOTTAL',

    # CLOSE_FRONT_VOWEL
    'ɪ': 'CLOSE_FRONT_VOWEL',
    'I': 'CLOSE_FRONT_VOWEL',
    'iː': 'CLOSE_FRONT_VOWEL',
    'ɪ̯': 'CLOSE_FRONT_VOWEL',
    'j': 'CLOSE_FRONT_VOWEL',
    'e': 'CLOSE_FRONT_VOWEL',
    'i': 'CLOSE_FRONT_VOWEL',
    'eː': 'CLOSE_FRONT_VOWEL',
    'e̞': 'CLOSE_FRONT_VOWEL',
    'øː': 'CLOSE_FRONT_VOWEL',

    # OPEN_FRONT_VOWEL
    'ɛ': 'OPEN_FRONT_VOWEL',
    'a': 'OPEN_FRONT_VOWEL',
    'æ': 'OPEN_FRONT_VOWEL',
    'ɛː': 'OPEN_FRONT_VOWEL',
    'aː': 'OPEN_FRONT_VOWEL',

    # MID_CENTRAL_VOWEL
    'ə': 'MID_CENTRAL_VOWEL',
    'ɚ': 'MID_CENTRAL_VOWEL',
    'ɐ': 'MID_CENTRAL_VOWEL',
    'ɐː': 'MID_CENTRAL_VOWEL',
    'ɘ': 'MID_CENTRAL_VOWEL',
    'əː': 'MID_CENTRAL_VOWEL',
    'ɜː': 'MID_CENTRAL_VOWEL',
    'ɵː': 'MID_CENTRAL_VOWEL',

    # CLOSE_BACK_VOWEL
    'w': 'CLOSE_BACK_VOWEL',
    'ʊ': 'CLOSE_BACK_VOWEL',
    'u': 'CLOSE_BACK_VOWEL',
    'uː': 'CLOSE_BACK_VOWEL',
    'ʉ': 'CLOSE_BACK_VOWEL',
    'ʉː': 'CLOSE_BACK_VOWEL',
    'ɯ': 'CLOSE_BACK_VOWEL',
    'ʍ': 'CLOSE_BACK_VOWEL',

    # OPEN_BACK_VOWEL
    'o': 'OPEN_BACK_VOWEL',
    'oː': 'OPEN_BACK_VOWEL',
    'ɔ': 'OPEN_BACK_VOWEL',
    'ɔː': 'OPEN_BACK_VOWEL',
    'ɑ': 'OPEN_BACK_VOWEL',
    'ɑː': 'OPEN_BACK_VOWEL',
    'ɒː': 'OPEN_BACK_VOWEL',
    'ɒ': 'OPEN_BACK_VOWEL',
    'ʌ': 'OPEN_BACK_VOWEL',
}