"""Speech Synthesis related modeling class"""
from typing import Optional, Tuple
from numpy import ndarray
from pororo.tasks import (
PororoFactoryBase,
PororoG2pFactory,
PororoSimpleBase,
download_or_load,
)
class PororoTtsFactory(PororoFactoryBase):
    """
    Synthesize speech from text using a trained model

    Output audio's sample rate is 22050

    Multi (`tacotron`)

        - dataset: TBU
        - metric: TBU

    Args:
        text (str): text for speech synthesis
        lang (str): text's language Ex) how are you?: en, 안녕하세요.: ko
        speaker (str): designate a speaker such as ko, en, zh etc. (default: lang)

    Returns:
        ndarray: waveform of speech signal

    Examples:
        >>> import IPython
        >>> from IPython.display import Audio
        >>> model = Pororo(task="tts", lang="multi")
        >>> # Typical TTS
        >>> wave = model("how are you?", lang="en")
        >>> IPython.display.display(IPython.display.Audio(data=wave, rate=22050))
        >>> # Voice Style Transfer
        >>> model = Pororo(task="tts", lang="multi")
        >>> wave = model("저는 미국 사람이에요.", lang="ko", speaker="en")
        >>> IPython.display.display(IPython.display.Audio(data=wave, rate=22050))
        >>> # Code-Switching
        >>> wave = model("저는 미국 사람이에요.", lang="ko", speaker="en-15,ko")
        >>> IPython.display.Audio(data=wave, rate=22050)

    Notes:
        Currently 11 languages are supported.
        Supported Languages: English, Korean, Japanese, Chinese, Jejueo, Dutch, German, Spanish, French, Russian, Finnish
        This task can designate a speaker such as ko, en, zh etc.

    """

    def __init__(self, task: str, lang: str = "multi", model: Optional[str] = None):
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        """Return the language keys this factory accepts"""
        return ["multi"]

    @staticmethod
    def get_available_models():
        """Return the model names available for each language key"""
        return {"multi": ["tacotron"]}

    def load(self, device: str):
        """
        Build the task-specific model selected by the user

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model, or None when the
                configured model name is not `tacotron`

        """
        if self.config.n_model != "tacotron":
            # Only `tacotron` is handled here; anything else yields None
            return None

        # Imported lazily so the TTS stack is only pulled in when requested
        from pororo.models.tts.synthesizer import MultilingualSpeechSynthesizer
        from pororo.models.tts.utils.numerical_pinyin_converter import (
            convert_from_numerical_pinyin,
        )
        from pororo.models.tts.utils.text import jejueo_romanize, romanize

        lang = self.config.lang

        # Resolved one after another, in exactly this order
        asset_names = (
            "misc/tacotron2",
            "misc/hifigan_en",
            "misc/hifigan_ko",
            "misc/hifigan_en_config.json",
            "misc/hifigan_ko_config.json",
            "misc/wavernn.pyt",
        )
        (
            tacotron,
            vocoder_en,
            vocoder_ko,
            vocoder_en_cfg,
            vocoder_ko_cfg,
            wavernn,
        ) = [download_or_load(name, lang) for name in asset_names]

        # The synthesizer takes each vocoder checkpoint followed by its config
        engine = MultilingualSpeechSynthesizer(
            tacotron,
            vocoder_en,
            vocoder_en_cfg,
            vocoder_ko,
            vocoder_ko_cfg,
            wavernn,
            device,
            lang,
        )

        return PororoTTS(
            engine,
            device,
            romanize,
            jejueo_romanize,
            convert_from_numerical_pinyin,
            self.config,
        )
class PororoTTS(PororoSimpleBase):
    """
    Text-to-speech wrapper around a multilingual synthesizer

    Converts the input text for the languages that need it (romanization for
    ko / je, g2p for ja / zh), formats the synthesizer input line and
    delegates waveform generation to the synthesizer's `predict`.

    """

    def __init__(
        self,
        synthesizer,
        device,
        romanize,
        jejueo_romanize,
        convert_from_numerical_pinyin,
        config,
    ):
        super().__init__(config)
        self._synthesizer = synthesizer

        # g2p models are loaded lazily, on the first ja / zh request
        self.g2p_ja = None
        self.g2p_zh = None

        # User-facing language code -> code written into the synthesizer input
        self.lang_dict = {
            "en": "en",
            "ko": "ko",
            "ja": "jp",
            "de": "de",
            "nl": "nl",
            "ru": "ru",
            "es": "es",
            "fr": "fr",
            "zh": "zh",
            "fi": "fi",
            "je": "je",
        }

        self.device = device
        self.romanize = romanize
        self.jejueo_romanize = jejueo_romanize
        self.convert_from_numerical_pinyin = convert_from_numerical_pinyin

    def _load_g2p_ja(self):
        """Load g2p module for Japanese"""
        self.g2p_ja = PororoG2pFactory(
            task="g2p",
            model="g2p.ja",
            lang="ja",
        ).load(self.device)

    def _load_g2p_zh(self):
        """Load g2p module for Chinese"""
        self.g2p_zh = PororoG2pFactory(
            task="g2p",
            model="g2p.zh",
            lang="zh",
        ).load(self.device)

    def _resolve_speaker(self, speaker: str) -> str:
        """
        Map the language codes of a speaker spec through `lang_dict`

        A spec is either a single code ("ja") or a comma-separated
        code-switching spec whose segments may carry a "-<suffix>" part
        ("en-15,ko"). Only the code before the first "-" of each segment is
        mapped; the rest of the segment is kept as written.

        Args:
            speaker (str): speaker spec given by the caller

        Returns:
            str: speaker spec with every language code mapped

        Raises:
            KeyError: if a segment's language code is not in `lang_dict`

        """
        resolved = []
        for segment in speaker.split(","):
            code, dash, suffix = segment.strip().partition("-")
            resolved.append(f"{self.lang_dict[code]}{dash}{suffix}")
        return ",".join(resolved)

    def _preprocess(
        self,
        text: str,
        lang: str = "en",
        speaker: str = "en",
    ) -> Tuple[str, str]:
        """
        Pre-process text for TTS format

        Args:
            text (str): text for tts
            lang (str): text language
            speaker (str): designation of speaker

        Returns:
            Tuple[str, str]: the synthesizer input line
                `<text>|00-<mapped lang>|<speaker>` and the speaker unchanged

        Raises:
            KeyError: if `lang` is not in `lang_dict`

        """
        if lang == "ko":
            text = self.romanize(text)
        elif lang == "ja":
            if self.g2p_ja is None:
                self._load_g2p_ja()
            text = self.g2p_ja(text)
        elif lang == "zh":
            if self.g2p_zh is None:
                self._load_g2p_zh()
            # NOTE(review): as written this replaces a space with a space
            # (a no-op); the first argument may have lost a special or
            # doubled whitespace character - confirm against upstream.
            text = self.g2p_zh(text).replace(" ", " ")
            text = self.convert_from_numerical_pinyin(text)
        elif lang == "je":
            text = self.jejueo_romanize(text)

        return f"{text}|00-{self.lang_dict[lang]}|{speaker}", speaker

    def predict(self, text: str, speaker: str) -> ndarray:
        """
        Conduct speech synthesis on given text

        Args:
            text (str): text for tts
            speaker (str): designation of speaker

        Returns:
            ndarray: waveform of speech signal

        """
        return self._synthesizer.predict(text, speaker)

    def __call__(
        self,
        text: str,
        lang: str = "en",
        speaker: Optional[str] = None,
    ):
        """
        Synthesize speech for the given text

        Args:
            text (str): text for tts
            lang (str): text language, a key of `lang_dict`
            speaker (str): a language code, or a comma-separated
                code-switching spec such as "en-15,ko" (default: lang)

        Returns:
            ndarray: waveform of speech signal

        Raises:
            KeyError: if `lang` or a language code in `speaker` is not in
                `lang_dict`

        """
        if speaker is None:
            speaker = lang

        # Mapped per segment so code-switching specs do not hit a KeyError
        speaker = self._resolve_speaker(speaker)

        text, speaker = self._preprocess(text, lang, speaker)
        return self.predict(text, speaker)