Zum Inhalt

TTS-Engines

Referenz für die verschiedenen Sprachsynthese-Engines.

Basis-Schnittstelle

text2speech.engines.base.TTSEngine

Bases: Protocol

Protocol for TTS engine implementations.

Source code in text2speech/engines/base.py
@runtime_checkable
class TTSEngine(Protocol):
    """Structural interface that every TTS engine must satisfy.

    Any class exposing a compatible ``synthesize`` generator method is
    accepted wherever a ``TTSEngine`` is expected (duck typing via
    ``typing.Protocol``); ``@runtime_checkable`` additionally enables
    ``isinstance`` checks against this protocol.
    """

    def synthesize(
        self, text: str, voice: Optional[str] = None, speed: float = 1.0
    ) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
        """Turn *text* into speech, yielding audio chunk by chunk.

        Args:
            text (str): Text to synthesize.
            voice (Optional[str]): Voice identifier.
            speed (float): Speech speed multiplier.

        Yields:
            Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
                Tuples of (graphemes, phonemes, audio_tensor).
        """
        ...

synthesize(text, voice=None, speed=1.0)

Synthesize speech from text.

Parameters:

Name Type Description Default
text str

Text to synthesize.

required
voice Optional[str]

Voice identifier.

None
speed float

Speech speed multiplier.

1.0

Yields:

Type Description
Tuple[Optional[str], Optional[str], Tensor]

Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]: Tuples of (graphemes, phonemes, audio_tensor).

Source code in text2speech/engines/base.py
def synthesize(
    self, text: str, voice: Optional[str] = None, speed: float = 1.0
) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
    """Turn *text* into speech, yielding audio chunk by chunk.

    This is the protocol stub: implementations yield one tuple per
    synthesized segment.

    Args:
        text (str): Text to synthesize.
        voice (Optional[str]): Voice identifier.
        speed (float): Speech speed multiplier.

    Yields:
        Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
            Tuples of (graphemes, phonemes, audio_tensor).
    """
    ...

Kokoro Engine

text2speech.engines.kokoro.KokoroEngine

TTS engine using the Kokoro model.

Source code in text2speech/engines/kokoro.py
class KokoroEngine:
    """TTS engine using the Kokoro model."""

    def __init__(self, lang_code: str = "a"):
        """Initialize Kokoro engine.

        Args:
            lang_code (str): Language code for the pipeline.

        Raises:
            ImportError: If kokoro package is not installed.
            RuntimeError: If pipeline initialization fails.
        """
        if not HAS_KOKORO or KPipeline is None:
            raise ImportError("kokoro package is not installed")
        try:
            self.pipeline = KPipeline(lang_code=lang_code)
        except Exception as e:
            # Chain the original exception (`from e`) so the root cause
            # is preserved in the traceback instead of being swallowed.
            raise RuntimeError(f"Failed to initialize Kokoro pipeline: {e}") from e

    def synthesize(
        self, text: str, voice: Optional[str] = None, speed: float = 1.0
    ) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
        """Synthesize speech using Kokoro.

        Args:
            text (str): Text to synthesize.
            voice (Optional[str]): Voice identifier.
            speed (float): Speech speed multiplier.

        Yields:
            Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
                Tuples of (graphemes, phonemes, audio_tensor).
        """
        # Kokoro pipeline returns a generator of (graphemes, phonemes, audio).
        generator = self.pipeline(text, voice=voice, speed=speed)
        for gs, ps, audio in generator:
            # The pipeline may yield numpy arrays; normalize to torch.Tensor.
            if not isinstance(audio, torch.Tensor):
                audio = torch.from_numpy(audio)
            yield gs, ps, audio

__init__(lang_code='a')

Initialize Kokoro engine.

Parameters:

Name Type Description Default
lang_code str

Language code for the pipeline.

'a'

Raises:

Type Description
ImportError

If kokoro package is not installed.

RuntimeError

If pipeline initialization fails.

Source code in text2speech/engines/kokoro.py
def __init__(self, lang_code: str = "a"):
    """Initialize Kokoro engine.

    Args:
        lang_code (str): Language code for the pipeline.

    Raises:
        ImportError: If kokoro package is not installed.
        RuntimeError: If pipeline initialization fails.
    """
    if not HAS_KOKORO or KPipeline is None:
        raise ImportError("kokoro package is not installed")
    try:
        self.pipeline = KPipeline(lang_code=lang_code)
    except Exception as e:
        # Chain the original exception (`from e`) so the root cause
        # is preserved in the traceback instead of being swallowed.
        raise RuntimeError(f"Failed to initialize Kokoro pipeline: {e}") from e

synthesize(text, voice=None, speed=1.0)

Synthesize speech using Kokoro.

Parameters:

Name Type Description Default
text str

Text to synthesize.

required
voice Optional[str]

Voice identifier.

None
speed float

Speech speed multiplier.

1.0

Yields:

Type Description
Tuple[Optional[str], Optional[str], Tensor]

Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]: Tuples of (graphemes, phonemes, audio_tensor).

Source code in text2speech/engines/kokoro.py
def synthesize(
    self, text: str, voice: Optional[str] = None, speed: float = 1.0
) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
    """Synthesize speech using Kokoro.

    Args:
        text (str): Text to synthesize.
        voice (Optional[str]): Voice identifier.
        speed (float): Speech speed multiplier.

    Yields:
        Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
            Tuples of (graphemes, phonemes, audio_tensor).
    """
    # The Kokoro pipeline is itself a generator of (graphemes, phonemes, audio).
    for graphemes, phonemes, chunk in self.pipeline(text, voice=voice, speed=speed):
        # Normalize numpy output to a torch.Tensor before yielding.
        tensor = chunk if isinstance(chunk, torch.Tensor) else torch.from_numpy(chunk)
        yield graphemes, phonemes, tensor

ElevenLabs Engine

text2speech.engines.elevenlabs.ElevenLabsEngine

TTS engine using ElevenLabs API.

Source code in text2speech/engines/elevenlabs.py
class ElevenLabsEngine:
    """TTS engine using ElevenLabs API."""

    def __init__(self, api_key: str, model: str = "eleven_multilingual_v2"):
        """Initialize ElevenLabs engine.

        Args:
            api_key (str): ElevenLabs API key.
            model (str): Model identifier.

        Raises:
            ImportError: If elevenlabs package is not installed.
        """
        if not HAS_ELEVENLABS or ElevenLabs is None:
            raise ImportError("elevenlabs package is not installed")
        self.client = ElevenLabs(api_key=api_key)
        self.model = model

    def synthesize(
        self, text: str, voice: Optional[str] = None, speed: float = 1.0
    ) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
        """Synthesize speech using ElevenLabs.

        Args:
            text (str): Text to synthesize.
            voice (Optional[str]): Voice identifier.
            speed (float): Speech speed multiplier (currently ignored for ElevenLabs).

        Yields:
            Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
                Tuples of (graphemes, phonemes, audio_tensor).
        """
        client: Any = self.client
        audio_data = client.generate(text=text, voice=voice or "Brian", model=self.model)

        # The SDK may return raw bytes or a generator of byte chunks;
        # normalize to a single bytes object before decoding.
        if not isinstance(audio_data, bytes):
            audio_data = b"".join(audio_data)
        # ElevenLabs provides no grapheme/phoneme alignment, hence None, None.
        yield None, None, self._bytes_to_tensor(audio_data)

    def _bytes_to_tensor(self, audio_bytes: bytes) -> torch.Tensor:
        """Convert audio bytes to torch Tensor.

        Args:
            audio_bytes (bytes): Raw audio data (typically MP3).

        Returns:
            torch.Tensor: 1D torch Tensor of audio waveform.

        Raises:
            RuntimeError: If the audio bytes cannot be decoded.
        """
        buffer = io.BytesIO(audio_bytes)
        try:
            waveform, _ = torchaudio.load(buffer)
            # Convert to mono if multi-channel
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            return cast(torch.Tensor, waveform.squeeze(0))
        except Exception as e:
            # Chain the cause (`from e`) so decode failures (e.g. missing
            # codec in torchaudio) remain diagnosable from the traceback.
            raise RuntimeError(f"Failed to decode ElevenLabs audio: {e}") from e

__init__(api_key, model='eleven_multilingual_v2')

Initialize ElevenLabs engine.

Parameters:

Name Type Description Default
api_key str

ElevenLabs API key.

required
model str

Model identifier.

'eleven_multilingual_v2'

Raises:

Type Description
ImportError

If elevenlabs package is not installed.

Source code in text2speech/engines/elevenlabs.py
def __init__(self, api_key: str, model: str = "eleven_multilingual_v2"):
    """Set up the ElevenLabs API client.

    Args:
        api_key (str): ElevenLabs API key.
        model (str): Model identifier.

    Raises:
        ImportError: If elevenlabs package is not installed.
    """
    # Refuse to construct the engine when the optional dependency is absent.
    if not (HAS_ELEVENLABS and ElevenLabs is not None):
        raise ImportError("elevenlabs package is not installed")
    self.client = ElevenLabs(api_key=api_key)
    self.model = model

synthesize(text, voice=None, speed=1.0)

Synthesize speech using ElevenLabs.

Parameters:

Name Type Description Default
text str

Text to synthesize.

required
voice Optional[str]

Voice identifier.

None
speed float

Speech speed multiplier (currently ignored for ElevenLabs).

1.0

Yields:

Type Description
Tuple[Optional[str], Optional[str], Tensor]

Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]: Tuples of (graphemes, phonemes, audio_tensor).

Source code in text2speech/engines/elevenlabs.py
def synthesize(
    self, text: str, voice: Optional[str] = None, speed: float = 1.0
) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
    """Synthesize speech using ElevenLabs.

    Args:
        text (str): Text to synthesize.
        voice (Optional[str]): Voice identifier.
        speed (float): Speech speed multiplier (currently ignored for ElevenLabs).

    Yields:
        Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
            Tuples of (graphemes, phonemes, audio_tensor).
    """
    client: Any = self.client
    audio_data = client.generate(text=text, voice=voice or "Brian", model=self.model)

    # The SDK may return raw bytes or a generator of byte chunks;
    # normalize to a single bytes object before decoding.
    if not isinstance(audio_data, bytes):
        audio_data = b"".join(audio_data)
    # ElevenLabs provides no grapheme/phoneme alignment, hence None, None.
    yield None, None, self._bytes_to_tensor(audio_data)