# Source code for pylips.speech.system_tts

import pyttsx3
from allosaurus.app import read_recognizer
import soundfile as sf
import pickle
import os

import warnings

# Silence the FutureWarning whose message starts with
# "You are using `torch.load` with `weights_only=False`".
# NOTE(review): presumably emitted when a dependency loads its model
# weights through torch -- confirm which import/call triggers it.
warnings.filterwarnings(
    "ignore",
    category=FutureWarning,
    message=r"You are using `torch\.load` with `weights_only=False`.*"
)
from sys import platform

class SystemTTS:
    '''
    A text-to-speech backend that uses the system's default TTS engine.

    Audio is written to ``pylips_phrases/<name>.wav`` and the matching viseme
    timing is pickled next to it as ``pylips_phrases/<name>.pkl``.

    args:
        None
    '''

    def __init__(self):
        self.engine = pyttsx3.init()
        self.engine.setProperty('rate', 120)
        self.model = read_recognizer()
        self.voices = [voice.id for voice in self.engine.getProperty('voices')]

        if self._on_linux():
            # linux requires special considerations for espeak-ng
            with os.popen('espeak-ng --voices') as pipe:
                result = pipe.read()
            # Drop the first and last lines of the listing and keep the
            # fixed-width column 4..20 of every remaining row.
            self.voices = [line[4:20].strip() for line in result.split('\n')[1:-1]]

    @staticmethod
    def _on_linux():
        '''Returns True when ``sys.platform`` is ``"linux"`` or ``"linux2"``.'''
        return platform in ("linux", "linux2")

    @staticmethod
    def _load_timing(wav_path):
        '''Loads the ``(times, visemes)`` pair pickled next to ``wav_path``.'''
        # NOTE: pickle executes arbitrary code while loading; only files that
        # this class wrote itself should ever live in the phrase directory.
        with open(f'{wav_path[:-4]}.pkl', 'rb') as cache_file:
            times, visemes = pickle.load(cache_file)
        return times, visemes

    def list_voices(self):
        '''
        Lists all the voices that are available in the system's default TTS backend.

        args:
            None
        '''
        for i, voice in enumerate(self.voices):
            print(f'{i}: {voice}')

    def gen_audio_and_visemes(self, text, voice_id=None, fname=None):
        '''
        Generates audio and visemes from a string of text using the system's default TTS engine.

        args:
            text (str): the text that the robot should speak
            voice_id (str or int): the voice that the robot should speak in, given
                either as an entry of ``self.voices`` or as an index into it
            fname (str): the name of the file that the audio should be saved to
                (without extension). If a phrase with this name was already
                generated, the cached result is returned instead.

        returns:
            (tuple): a tuple containing ``fname``, ``times``, and ``visemes``.
                fname is the path to the audio file, times is a list of times
                that correspond to the initiation of the visemes, and visemes
                is a list of visemes that correspond to the words in the audio.
                The times are the first field of each recognizer output line
                (kept as strings); the final entry is a float equal to the
                audio length in seconds plus 0.2, paired with ``'IDLE'``.

        raises:
            Exception: if the voice_id is not in the list of available voices
        '''
        on_linux = self._on_linux()

        if voice_id is None:
            voice_id = 'en' if on_linux else 'default'
        else:
            if type(voice_id) is int and voice_id < len(self.voices):
                voice_id = self.voices[voice_id]
            elif voice_id not in self.voices:
                raise Exception(f'voice "{voice_id}" does not exist')
            # Apply the voice for index-based selection too; previously only a
            # voice passed by name was forwarded to the engine.
            self.engine.setProperty('voice', voice_id)

        if fname is None:
            fname = f"pylips_phrases/{voice_id}_output.wav"
        else:
            # if it was already cached, return it, otherwise, generate it and return it
            fname = f"pylips_phrases/{fname}.wav"
            if os.path.exists(fname):
                times, visemes = self._load_timing(fname)
                return fname, times, visemes

        # The synthesizers below do not create missing directories themselves.
        os.makedirs(os.path.dirname(fname), exist_ok=True)

        # Synthesize speech
        if on_linux:
            # linux has issues with saving multiple files...
            # An argument list (no shell) keeps quotes and other shell
            # metacharacters in ``text`` from breaking or hijacking the command;
            # ``--`` stops text that starts with "-" being parsed as an option.
            import subprocess
            subprocess.run(
                ['espeak-ng', '-v', voice_id, '-s', '100', '-w', fname, '--', text],
                check=False,
            )
        else:
            self.engine.save_to_file(text, fname)
            self.engine.runAndWait()

        # Round-trip the file through soundfile.
        # NOTE(review): presumably rewrites the WAV in a form the recognizer
        # accepts -- confirm before removing.
        data, samplerate = sf.read(fname)
        sf.write(fname, data, samplerate)

        # synthesize visemes
        out = self.model.recognize(fname, timestamp=True, lang_id='eng')
        times = []
        visemes = []
        for line in out.split('\n'):
            fields = line.split(' ')
            times.append(fields[0])
            visemes.append(IPA2VISEME[fields[-1]])

        # Close the mouth shortly after the audio ends.
        times.append(len(data)/samplerate + 0.2)
        visemes.append('IDLE')

        with open(f'{fname[:-4]}.pkl', 'wb') as cache_file:
            pickle.dump((times, visemes), cache_file)

        return fname, times, visemes

    def get_audio_and_visemes(self, fname):
        '''
        Gets the audio and visemes from a file that was previously generated.

        args:
            fname (str): the name of the file that the audio and visemes were
                saved to. It is not necessary to include the file extension.

        returns:
            (tuple): a tuple containing ``fname``, ``times``, and ``visemes``.
                fname is the path to the audio file, times is a list of times
                that correspond to the initiation of the visemes, and visemes
                is a list of visemes that correspond to the words in the audio

        raises:
            Exception: if no audio file was generated under this name
        '''
        # if it was already cached, return it, otherwise, raise an error
        fname = f"pylips_phrases/{fname}.wav"
        if not os.path.exists(fname):
            raise Exception(f'phrase {fname} does not exist')

        times, visemes = self._load_timing(fname)
        return fname, times, visemes
# Phone symbols produced by the recognizer, grouped under the viseme label
# each one is rendered with. 'sil' and the empty string both map to 'IDLE'.
_VISEME_PHONES = (
    ('IDLE', ('sil', '')),
    ('BILABIAL', ('k͡p̚', 'm', 'b', 'p', 'pʰ')),
    ('LABIODENTAL', ('v', 'f')),
    ('INTERDENTAL', ('θ', 'ð')),
    ('DENTAL_ALVEOLAR', ('l', 'd', 't', 'tʰ', 't̠', 'n', 'ɳ', 's', 'z')),
    ('POSTALVEOLAR', (
        'ʃ', 'ʒ', 'ɹ̩', 'ɹ', 'r', 'ɻ', 'ɾ', 'dʒ', 'tʃ', 't͡ʃʲ', 'ij', 'tɕʰ',
        'x', 'd̠',
    )),
    ('VELAR_GLOTTAL', ('h', 'k', 'kʰ', 'g', 'ɡ', 'ŋ', 'ʔ')),
    ('CLOSE_FRONT_VOWEL', (
        'ɪ', 'I', 'iː', 'ɪ̯', 'j', 'e', 'i', 'eː', 'e̞', 'øː',
    )),
    ('OPEN_FRONT_VOWEL', ('ɛ', 'a', 'æ', 'ɛː', 'aː')),
    ('MID_CENTRAL_VOWEL', ('ə', 'ɚ', 'ɐ', 'ɐː', 'ɘ', 'əː', 'ɜː', 'ɵː')),
    ('CLOSE_BACK_VOWEL', ('w', 'ʊ', 'u', 'uː', 'ʉ', 'ʉː', 'ɯ', 'ʍ')),
    ('OPEN_BACK_VOWEL', ('o', 'oː', 'ɔ', 'ɔː', 'ɑ', 'ɑː', 'ɒː', 'ɒ', 'ʌ')),
)

# Flat phone -> viseme lookup used when turning recognizer output into visemes.
IPA2VISEME = {
    phone: viseme
    for viseme, phones in _VISEME_PHONES
    for phone in phones
}