Source code for pororo.tasks.speech_synthesis

"""Speech Synthesis related modeling class"""

from typing import Optional, Tuple

from numpy import ndarray

from pororo.tasks import (
    PororoFactoryBase,
    PororoG2pFactory,
    PororoSimpleBase,
    download_or_load,
)


class PororoTtsFactory(PororoFactoryBase):
    """
    Synthesize speech from text using a trained model
    The sample rate of the output audio is 22050

    Multi (`tacotron`)

        - dataset: TBU
        - metric: TBU

    Args:
        text (str): text for speech synthesis
        lang (str): text's language Ex) how are you?: en, 안녕하세요.: ko
        speaker (str): designate a speaker such as ko, en, zh etc.. (default: lang)

    Returns:
        ndarray: waveform of speech signal

    Examples:
        >>> import IPython
        >>> from IPython.display import Audio
        >>> model = Pororo(task="tts", lang="multi")
        >>> # Typical TTS
        >>> wave = model("how are you?", lang="en")
        >>> IPython.display.display(IPython.display.Audio(data=wave, rate=22050))
        >>> # Voice Style Transfer
        >>> model = Pororo(task="tts", lang="multi")
        >>> wave = model("저는 미국 사람이에요.", lang="ko", speaker="en")
        >>> IPython.display.display(IPython.display.Audio(data=wave, rate=22050))
        >>> # Code-Switching
        >>> wave = model("저는 미국 사람이에요.", lang="ko", speaker="en-15,ko")
        >>> IPython.display.Audio(data=wave, rate=22050)


    Notes:
        11 languages are currently supported.
        Supported Languages: English, Korean, Japanese, Chinese, Jejueo, Dutch, German, Spanish, French, Russian, Finnish
        This task can designate a speaker such as ko, en, zh etc.

    """

    def __init__(
        self,
        task: str,
        lang: str = "multi",
        model: Optional[str] = None,
    ):
        # Nothing TTS-specific to set up; the base factory receives everything.
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        """Return the language identifiers this factory accepts."""
        return ["multi"]

    @staticmethod
    def get_available_models():
        """Return the model names available for each accepted language."""
        return {
            "multi": ["tacotron"],
        }

    def load(self, device: str):
        """
        Build the task-specific model selected by the user

        Args:
            device (str): device information

        Returns:
            object: a `PororoTTS` instance when the configured model is
                `tacotron`; None for any other model name

        """
        if self.config.n_model != "tacotron":
            # No other model is handled here.
            return None

        # Imported lazily so the TTS dependencies are only needed when used.
        from pororo.models.tts.synthesizer import (
            MultilingualSpeechSynthesizer,
        )
        from pororo.models.tts.utils.numerical_pinyin_converter import (
            convert_from_numerical_pinyin,
        )
        from pororo.models.tts.utils.text import jejueo_romanize, romanize

        target_lang = self.config.lang

        # Resources are resolved one by one, in this exact order.
        resource_names = (
            "misc/tacotron2",
            "misc/hifigan_en",
            "misc/hifigan_ko",
            "misc/hifigan_en_config.json",
            "misc/hifigan_ko_config.json",
            "misc/wavernn.pyt",
        )
        (
            tacotron,
            vocoder_en,
            vocoder_ko,
            vocoder_en_cfg,
            vocoder_ko_cfg,
            wavernn,
        ) = [download_or_load(name, target_lang) for name in resource_names]

        # The synthesizer expects each vocoder followed by its config.
        engine = MultilingualSpeechSynthesizer(
            tacotron,
            vocoder_en,
            vocoder_en_cfg,
            vocoder_ko,
            vocoder_ko_cfg,
            wavernn,
            device,
            target_lang,
        )

        return PororoTTS(
            engine,
            device,
            romanize,
            jejueo_romanize,
            convert_from_numerical_pinyin,
            self.config,
        )


class PororoTTS(PororoSimpleBase):
    """
    Callable text-to-speech wrapper around a speech synthesizer

    Calling an instance pre-processes the text for the requested language,
    formats it together with the speaker information and passes the result
    to the synthesizer's `predict` method.

    Args:
        synthesizer: object exposing `predict(text, speaker)`
        device (str): device information, used when loading g2p modules
        romanize: callable applied to Korean text
        jejueo_romanize: callable applied to Jejueo text
        convert_from_numerical_pinyin: callable applied to the Chinese g2p output
        config: task configuration forwarded to the base class

    """

    def __init__(
        self,
        synthesizer,
        device,
        romanize,
        jejueo_romanize,
        convert_from_numerical_pinyin,
        config,
    ):
        super().__init__(config)
        self._synthesizer = synthesizer

        # g2p modules are created on first use (see `_preprocess`).
        self.g2p_ja = None
        self.g2p_zh = None

        # Maps user-facing language codes to the codes placed in the
        # synthesizer input; only Japanese differs ("ja" -> "jp").
        self.lang_dict = {
            "en": "en",
            "ko": "ko",
            "ja": "jp",
            "de": "de",
            "nl": "nl",
            "ru": "ru",
            "es": "es",
            "fr": "fr",
            "zh": "zh",
            "fi": "fi",
            "je": "je",
        }
        self.device = device

        self.romanize = romanize
        self.jejueo_romanize = jejueo_romanize
        self.convert_from_numerical_pinyin = convert_from_numerical_pinyin

    def _load_g2p_ja(self):
        """Load g2p module for Japanese"""
        self.g2p_ja = PororoG2pFactory(
            task="g2p",
            model="g2p.ja",
            lang="ja",
        ).load(self.device)

    def _load_g2p_zh(self):
        """Load g2p module for Chinese"""
        self.g2p_zh = PororoG2pFactory(
            task="g2p",
            model="g2p.zh",
            lang="zh",
        ).load(self.device)

    def _resolve_speaker(self, speaker: str) -> str:
        """
        Translate the language codes of a speaker specification

        A plain code such as `ja` is mapped through `lang_dict`. A composite
        specification such as `en-15,ko` is split on commas and the code
        before the first `-` of every component is mapped, leaving the
        remainder of the component untouched.

        Args:
            speaker (str): plain language code or comma-separated specification

        Returns:
            str: specification with every language code translated

        Raises:
            KeyError: if a component's language code is not in `lang_dict`

        """
        resolved = []
        for component in speaker.split(","):
            # NOTE(review): the part after "-" is passed through as-is; its
            # meaning is defined by the synthesizer, not by this class.
            code, dash, rest = component.partition("-")
            resolved.append(f"{self.lang_dict[code]}{dash}{rest}")
        return ",".join(resolved)

    def _preprocess(
        self,
        text: str,
        lang: str = "en",
        speaker: str = "en",
    ) -> Tuple[str, str]:
        """
        Pre-process text for TTS format

        Args:
            text (str): text for tts
            lang (str): text language
            speaker (str): designation of speaker

        Returns:
            Tuple[str, str]: the formatted input `{text}|00-{lang}|{speaker}`
                and the speaker, unchanged

        Raises:
            KeyError: if `lang` is not in `lang_dict`

        """
        if lang == "ko":
            text = self.romanize(text)
        elif lang == "ja":
            if self.g2p_ja is None:
                self._load_g2p_ja()
            text = self.g2p_ja(text)
        elif lang == "zh":
            if self.g2p_zh is None:
                self._load_g2p_zh()
            # Triple spaces in the g2p output are collapsed to a single one
            # before the pinyin conversion.
            text = self.g2p_zh(text).replace("   ", " ")
            text = self.convert_from_numerical_pinyin(text)
        elif lang == "je":
            text = self.jejueo_romanize(text)
        # Any other language is passed to the synthesizer without conversion.
        return f"{text}|00-{self.lang_dict[lang]}|{speaker}", speaker

    def predict(self, text: str, speaker: str) -> ndarray:
        """
        Conduct speech synthesis on given text

        Args:
            text (str): text for tts, already formatted by `_preprocess`
            speaker (str): designation of speaker

        Returns:
             ndarray: waveform of speech signal

        """
        return self._synthesizer.predict(text, speaker)

    def __call__(
        self,
        text: str,
        lang: str = "en",
        speaker: Optional[str] = None,
    ):
        """
        Synthesize speech for the given text

        Args:
            text (str): text for tts
            lang (str): text language
            speaker (Optional[str]): designation of speaker, either a plain
                language code or a comma-separated specification such as
                `en-15,ko`; defaults to `lang`

        Returns:
            ndarray: waveform of speech signal

        Raises:
            KeyError: if `lang` or a language code inside `speaker` is not
                in `lang_dict`

        """
        if speaker is None:
            speaker = lang

        # A direct `lang_dict[speaker]` lookup would reject composite
        # specifications, so every component is translated separately.
        speaker = self._resolve_speaker(speaker)

        text, speaker = self._preprocess(text, lang, speaker)
        return self.predict(text, speaker)