feat: add XTTS v2 voice generator with clone source management

2026-04-10 01:25:18 -04:00
parent ee08a1ffd3
commit 68a1d143e8
2 changed files with 163 additions and 0 deletions
@@ -0,0 +1,73 @@
 """XTTS v2 wrapper for voice cloning from non-voice audio samples."""
 import os
 import random
 import tempfile
 from pathlib import Path
 try:
    from TTS.api import TTS
 except ImportError:
    TTS = None  # Tests patch this; real runtime requires the TTS package
 from server.config import config
 class VoiceGenerator:
    """Generates speech cloned from arbitrary audio samples via XTTS v2."""
    def __init__(
        self,
        device: str | None = None,
        model_name: str | None = None,
        samples_dir: str | None = None,
    ):
        self.device = device or config.device
        self.model_name = model_name or config.models.xtts_model
        self.samples_dir = Path(samples_dir or config.samples_dir)
        if TTS is None:
            raise RuntimeError(
                "TTS package is not installed; cannot instantiate VoiceGenerator"
            )
        self._tts = TTS(model_name=self.model_name)
        self._tts.to(self.device)
    def generate(self, text: str, speaker_wav: str | None = None) -> bytes:
        """Generate speech as WAV bytes. Uses a random clone source if none specified."""
        if speaker_wav is None:
            speaker_wav = self.random_clone_source()
        if speaker_wav is None:
            raise ValueError("No speaker WAV provided and no samples available")
        # XTTS writes to file, so use a temp file
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        tmp.close()
        try:
            self._tts.tts_to_file(
                text=text,
                speaker_wav=speaker_wav,
                language=config.models.xtts_language,
                file_path=tmp.name,
            )
            with open(tmp.name, "rb") as f:
                return f.read()
        finally:
            try:
                os.unlink(tmp.name)
            except OSError:
                pass
    def list_clone_sources(self) -> list[str]:
        """List all WAV files in the samples directory."""
        if not self.samples_dir.is_dir():
            return []
        return [
            str(p) for p in sorted(self.samples_dir.glob("*.wav"))
        ]
    def random_clone_source(self) -> str | None:
        """Pick a random clone source WAV file."""
        sources = self.list_clone_sources()
        if not sources:
            return None
        return random.choice(sources)
@@ -0,0 +1,90 @@
 import os
 import tempfile
 import wave
 from unittest.mock import MagicMock, patch
 from server.voice_generator import VoiceGenerator
 class TestVoiceGenerator:
    """Tests for VoiceGenerator with the TTS backend replaced by a mock."""

    @patch("server.voice_generator.TTS")
    def test_init_loads_model(self, mock_tts_cls):
        """Generator loads the XTTS v2 model on init and moves it to the device."""
        mock_tts = MagicMock()
        mock_tts_cls.return_value = mock_tts
        VoiceGenerator(device="cpu")
        mock_tts_cls.assert_called_once()
        mock_tts.to.assert_called_once_with("cpu")

    @patch("server.voice_generator.TTS")
    def test_generate_returns_wav_bytes(self, mock_tts_cls):
        """Generate returns the WAV bytes written by the backend and cleans up."""
        mock_tts = MagicMock()
        mock_tts_cls.return_value = mock_tts
        # Create a real WAV file for the mock to "produce"
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            tmp_wav = f.name
            with wave.open(f, "wb") as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)
                wf.setframerate(22050)
                wf.writeframes(b"\x00\x00" * 22050)  # 1 second of silence
        try:
            with open(tmp_wav, "rb") as src:
                expected = src.read()
            output_paths = []

            # Mock tts_to_file to copy our test WAV and record where it wrote
            def fake_tts_to_file(text, speaker_wav, language, file_path):
                import shutil
                output_paths.append(file_path)
                shutil.copy2(tmp_wav, file_path)

            mock_tts.tts_to_file = fake_tts_to_file
            gen = VoiceGenerator(device="cpu")
            data = gen.generate("hello", speaker_wav=tmp_wav)
            assert isinstance(data, bytes)
            assert data == expected
            assert data.startswith(b"RIFF")
            # The intermediate output file must not be left behind
            assert len(output_paths) == 1
            assert not os.path.exists(output_paths[0])
        finally:
            os.unlink(tmp_wav)

    @patch("server.voice_generator.TTS")
    def test_list_clone_sources(self, mock_tts_cls):
        """Lists available clone source files in sorted order."""
        mock_tts = MagicMock()
        mock_tts_cls.return_value = mock_tts
        with tempfile.TemporaryDirectory() as samples_dir:
            # Create some fake sample files (deliberately not in sorted order)
            for name in ["wind.wav", "dog.wav", "machine.wav"]:
                with open(os.path.join(samples_dir, name), "wb") as f:
                    f.write(b"fake")
            gen = VoiceGenerator(device="cpu", samples_dir=samples_dir)
            sources = gen.list_clone_sources()
            assert len(sources) == 3
            assert all(s.endswith(".wav") for s in sources)
            assert [os.path.basename(s) for s in sources] == [
                "dog.wav",
                "machine.wav",
                "wind.wav",
            ]

    @patch("server.voice_generator.TTS")
    def test_random_clone_source(self, mock_tts_cls):
        """Picks a random clone source from samples directory."""
        mock_tts = MagicMock()
        mock_tts_cls.return_value = mock_tts
        with tempfile.TemporaryDirectory() as samples_dir:
            for name in ["a.wav", "b.wav", "c.wav"]:
                with open(os.path.join(samples_dir, name), "wb") as f:
                    f.write(b"fake")
            gen = VoiceGenerator(device="cpu", samples_dir=samples_dir)
            source = gen.random_clone_source()
            assert source is not None
            assert source.endswith(".wav")
            # The pick must be one of the listed sources
            assert source in gen.list_clone_sources()

    @patch("server.voice_generator.TTS")
    def test_empty_samples_dir(self, mock_tts_cls):
        """Empty samples dir lists nothing and returns None for random source."""
        mock_tts = MagicMock()
        mock_tts_cls.return_value = mock_tts
        with tempfile.TemporaryDirectory() as samples_dir:
            gen = VoiceGenerator(device="cpu", samples_dir=samples_dir)
            assert gen.list_clone_sources() == []
            assert gen.random_clone_source() is None