Zum Inhalt

TTS-Engines

Referenz für die verschiedenen Sprachsynthese-Engines.

Basis-Schnittstelle

text2speech.engines.base.TTSEngine

Bases: Protocol

Protocol for TTS engine implementations.

Source code in text2speech/engines/base.py
@runtime_checkable
class TTSEngine(Protocol):
    """Structural interface that every TTS engine must satisfy.

    Any class exposing a compatible ``synthesize`` generator method is
    accepted wherever a ``TTSEngine`` is expected (duck typing via
    ``typing.Protocol``); ``@runtime_checkable`` additionally enables
    ``isinstance`` checks against this protocol.
    """

    def synthesize(
        self, text: str, voice: Optional[str] = None, speed: float = 1.0
    ) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
        """Turn *text* into speech, yielding audio chunk by chunk.

        Args:
            text (str): Text to synthesize.
            voice (Optional[str]): Voice identifier.
            speed (float): Speech speed multiplier.

        Yields:
            Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
                Tuples of (graphemes, phonemes, audio_tensor).
        """
        ...

synthesize(text, voice=None, speed=1.0)

Synthesize speech from text.

Parameters:

Name Type Description Default
text str

Text to synthesize.

required
voice Optional[str]

Voice identifier.

None
speed float

Speech speed multiplier.

1.0

Yields:

Type Description
Tuple[Optional[str], Optional[str], Tensor]

Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]: Tuples of (graphemes, phonemes, audio_tensor).

Source code in text2speech/engines/base.py
def synthesize(
    self, text: str, voice: Optional[str] = None, speed: float = 1.0
) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
    """Turn *text* into speech, yielding audio chunk by chunk.

    This is the protocol stub: implementations yield one tuple per
    synthesized segment.

    Args:
        text (str): Text to synthesize.
        voice (Optional[str]): Voice identifier.
        speed (float): Speech speed multiplier.

    Yields:
        Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
            Tuples of (graphemes, phonemes, audio_tensor).
    """
    ...

Kokoro Engine

text2speech.engines.kokoro.KokoroEngine

TTS engine using the Kokoro model.

Source code in text2speech/engines/kokoro.py
class KokoroEngine:
    """TTS engine using the Kokoro model."""

    def __init__(self, lang_code: str = "a"):
        """Initialize Kokoro engine.

        Args:
            lang_code (str): Language code for the pipeline.

        Raises:
            ImportError: If kokoro package is not installed.
            RuntimeError: If pipeline initialization fails.
        """
        if not HAS_KOKORO or KPipeline is None:
            raise ImportError("kokoro package is not installed")
        try:
            self.pipeline = KPipeline(lang_code=lang_code)
        except Exception as e:
            # Chain the original exception (`from e`) so the root cause
            # is preserved in the traceback instead of being swallowed.
            raise RuntimeError(f"Failed to initialize Kokoro pipeline: {e}") from e

    def synthesize(
        self, text: str, voice: Optional[str] = None, speed: float = 1.0
    ) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
        """Synthesize speech using Kokoro.

        Args:
            text (str): Text to synthesize.
            voice (Optional[str]): Voice identifier.
            speed (float): Speech speed multiplier.

        Yields:
            Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
                Tuples of (graphemes, phonemes, audio_tensor).
        """
        # Kokoro pipeline returns a generator of (graphemes, phonemes, audio).
        generator = self.pipeline(text, voice=voice, speed=speed)
        for gs, ps, audio in generator:
            # The pipeline may yield numpy arrays; normalize to torch.Tensor.
            if not isinstance(audio, torch.Tensor):
                audio = torch.from_numpy(audio)
            yield gs, ps, audio

__init__(lang_code='a')

Initialize Kokoro engine.

Parameters:

Name Type Description Default
lang_code str

Language code for the pipeline.

'a'

Raises:

Type Description
ImportError

If kokoro package is not installed.

RuntimeError

If pipeline initialization fails.

Source code in text2speech/engines/kokoro.py
def __init__(self, lang_code: str = "a"):
    """Initialize Kokoro engine.

    Args:
        lang_code (str): Language code for the pipeline.

    Raises:
        ImportError: If kokoro package is not installed.
        RuntimeError: If pipeline initialization fails.
    """
    if not HAS_KOKORO or KPipeline is None:
        raise ImportError("kokoro package is not installed")
    try:
        self.pipeline = KPipeline(lang_code=lang_code)
    except Exception as e:
        # Chain the original exception (`from e`) so the root cause
        # is preserved in the traceback instead of being swallowed.
        raise RuntimeError(f"Failed to initialize Kokoro pipeline: {e}") from e

synthesize(text, voice=None, speed=1.0)

Synthesize speech using Kokoro.

Parameters:

Name Type Description Default
text str

Text to synthesize.

required
voice Optional[str]

Voice identifier.

None
speed float

Speech speed multiplier.

1.0

Yields:

Type Description
Tuple[Optional[str], Optional[str], Tensor]

Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]: Tuples of (graphemes, phonemes, audio_tensor).

Source code in text2speech/engines/kokoro.py
def synthesize(
    self, text: str, voice: Optional[str] = None, speed: float = 1.0
) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
    """Synthesize speech using Kokoro.

    Args:
        text (str): Text to synthesize.
        voice (Optional[str]): Voice identifier.
        speed (float): Speech speed multiplier.

    Yields:
        Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
            Tuples of (graphemes, phonemes, audio_tensor).
    """
    # The Kokoro pipeline is itself a generator of (graphemes, phonemes, audio).
    for graphemes, phonemes, chunk in self.pipeline(text, voice=voice, speed=speed):
        # Normalize numpy output to a torch.Tensor before yielding.
        tensor = chunk if isinstance(chunk, torch.Tensor) else torch.from_numpy(chunk)
        yield graphemes, phonemes, tensor

ElevenLabs Engine

text2speech.engines.elevenlabs.ElevenLabsEngine

TTS engine using ElevenLabs API.

Source code in text2speech/engines/elevenlabs.py
class ElevenLabsEngine:
    """TTS engine using ElevenLabs API."""

    def __init__(self, api_key: str, model: str = "eleven_multilingual_v2"):
        """Initialize ElevenLabs engine.

        Args:
            api_key (str): ElevenLabs API key.
            model (str): Model identifier.

        Raises:
            ImportError: If elevenlabs package is not installed.
        """
        if not HAS_ELEVENLABS or ElevenLabs is None:
            raise ImportError("elevenlabs package is not installed")
        self.client = ElevenLabs(api_key=api_key)
        self.model = model

    def synthesize(
        self, text: str, voice: Optional[str] = None, speed: float = 1.0
    ) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
        """Synthesize speech using ElevenLabs.

        Args:
            text (str): Text to synthesize.
            voice (Optional[str]): Voice identifier.
            speed (float): Speech speed multiplier (currently ignored for ElevenLabs).

        Yields:
            Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
                Tuples of (graphemes, phonemes, audio_tensor).
        """
        client: Any = self.client
        audio_data = client.generate(text=text, voice=voice or "Brian", model=self.model)

        # The SDK may return raw bytes or a generator of byte chunks;
        # normalize to a single bytes object before decoding.
        if not isinstance(audio_data, bytes):
            audio_data = b"".join(audio_data)
        # ElevenLabs provides no grapheme/phoneme alignment, hence None, None.
        yield None, None, self._bytes_to_tensor(audio_data)

    def _bytes_to_tensor(self, audio_bytes: bytes) -> torch.Tensor:
        """Convert audio bytes to torch Tensor.

        Args:
            audio_bytes (bytes): Raw audio data (typically MP3).

        Returns:
            torch.Tensor: 1D torch Tensor of audio waveform.

        Raises:
            RuntimeError: If the audio bytes cannot be decoded.
        """
        buffer = io.BytesIO(audio_bytes)
        try:
            waveform, _ = torchaudio.load(buffer)
            # Convert to mono if multi-channel
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            return cast(torch.Tensor, waveform.squeeze(0))
        except Exception as e:
            # Chain the cause (`from e`) so decode failures (e.g. missing
            # codec in torchaudio) remain diagnosable from the traceback.
            raise RuntimeError(f"Failed to decode ElevenLabs audio: {e}") from e

__init__(api_key, model='eleven_multilingual_v2')

Initialize ElevenLabs engine.

Parameters:

Name Type Description Default
api_key str

ElevenLabs API key.

required
model str

Model identifier.

'eleven_multilingual_v2'

Raises:

Type Description
ImportError

If elevenlabs package is not installed.

Source code in text2speech/engines/elevenlabs.py
def __init__(self, api_key: str, model: str = "eleven_multilingual_v2"):
    """Set up the ElevenLabs API client.

    Args:
        api_key (str): ElevenLabs API key.
        model (str): Model identifier.

    Raises:
        ImportError: If elevenlabs package is not installed.
    """
    # Refuse to construct the engine when the optional dependency is absent.
    if not (HAS_ELEVENLABS and ElevenLabs is not None):
        raise ImportError("elevenlabs package is not installed")
    self.client = ElevenLabs(api_key=api_key)
    self.model = model

synthesize(text, voice=None, speed=1.0)

Synthesize speech using ElevenLabs.

Parameters:

Name Type Description Default
text str

Text to synthesize.

required
voice Optional[str]

Voice identifier.

None
speed float

Speech speed multiplier (currently ignored for ElevenLabs).

1.0

Yields:

Type Description
Tuple[Optional[str], Optional[str], Tensor]

Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]: Tuples of (graphemes, phonemes, audio_tensor).

Source code in text2speech/engines/elevenlabs.py
def synthesize(
    self, text: str, voice: Optional[str] = None, speed: float = 1.0
) -> Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
    """Synthesize speech using ElevenLabs.

    Args:
        text (str): Text to synthesize.
        voice (Optional[str]): Voice identifier.
        speed (float): Speech speed multiplier (currently ignored for ElevenLabs).

    Yields:
        Iterator[Tuple[Optional[str], Optional[str], torch.Tensor]]:
            Tuples of (graphemes, phonemes, audio_tensor).
    """
    client: Any = self.client
    audio_data = client.generate(text=text, voice=voice or "Brian", model=self.model)

    # The SDK may return raw bytes or a generator of byte chunks;
    # normalize to a single bytes object before decoding.
    if not isinstance(audio_data, bytes):
        audio_data = b"".join(audio_data)
    # ElevenLabs provides no grapheme/phoneme alignment, hence None, None.
    yield None, None, self._bytes_to_tensor(audio_data)